library(tidyverse)
## -- Attaching packages ------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.2.1 v purrr 0.3.3
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 1.0.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts ---------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
DataCamp offers interactive courses related to R Programming. While some of the material is review, it is helpful to see other perspectives on it. As well, DataCamp has some interesting materials on packages that I want to learn better (ggplot2, dplyr, ggvis, etc.). This document summarizes a few key insights from:
This document is currently split between _v003 and _v003_a and _v003_b and _v003_c due to the need to keep the number of DLLs that it opens below the hard-coded maximum. This introductory section needs to be re-written, and the contents consolidated, at a future date.
The original DataCamp_Insights_v001 and DataCamp_Insights_v002 documents have been split for this document:
Chapter 1 - Overview and Introduction
What is a hierarchical model?
Parts of a regression:
Random effects in regression:
School data:
Example code includes:
# Load the classroom data and convert the integer-coded sex and
# minority columns into labeled factors.
# NOTE(review): relative path — assumes ./RInputFiles/ exists under
# the working directory.
rawStudent <- read.csv("./RInputFiles/classroom.csv")
studentData <- rawStudent %>%
mutate(sex=factor(sex, labels=c("male", "female")), minority=factor(minority, labels=c("no", "yes")))
# Plot the data
ggplot(data = studentData, aes(x = housepov, y = mathgain)) +
geom_point() +
geom_smooth(method = 'lm')
# Fit a linear model
summary( lm(mathgain ~ housepov , data = studentData))
##
## Call:
## lm(formula = mathgain ~ housepov, data = studentData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -168.226 -22.222 -1.306 19.763 195.156
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 56.937 1.674 34.02 <2e-16 ***
## housepov 3.531 7.515 0.47 0.639
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 34.63 on 1188 degrees of freedom
## Multiple R-squared: 0.0001858, Adjusted R-squared: -0.0006558
## F-statistic: 0.2208 on 1 and 1188 DF, p-value: 0.6385
# I have aggregated the data for you into two new datasets at the classroom- and school-levels (As a side note, if you want to learn how to aggregate data, the dplyr or data.table courses teach these skills)
# We will also compare the model outputs across all three outputs
# Note: how we aggregate the data is important
# I aggregated the data by taking the mean across the student data (in pseudo-code: mean(mathgain) by school or mean(mathgain) by classroom),
# but another reasonable method for aggregating the data would be to aggregate by classroom first and school second
# Classroom-level aggregation: mean of each variable within each
# schoolid x classid combination, with NAs removed.
classData <- studentData %>%
group_by(schoolid, classid) %>%
summarize_at(vars(mathgain, mathprep, housepov, yearstea), mean, na.rm=TRUE)
str(classData)
## Classes 'grouped_df', 'tbl_df', 'tbl' and 'data.frame': 312 obs. of 6 variables:
## $ schoolid: int 1 1 2 2 2 3 3 3 3 4 ...
## $ classid : int 160 217 197 211 307 11 137 145 228 48 ...
## $ mathgain: num 65.7 57.4 49.5 69 68.8 ...
## $ mathprep: num 2 3.25 2.5 2.33 2.3 3.83 2.25 3 2.17 2 ...
## $ housepov: num 0.082 0.082 0.082 0.082 0.082 0.086 0.086 0.086 0.086 0.365 ...
## $ yearstea: num 1 2 1 2 12.5 ...
## - attr(*, "groups")=Classes 'tbl_df', 'tbl' and 'data.frame': 107 obs. of 2 variables:
## ..$ schoolid: int 1 2 3 4 5 6 7 8 9 10 ...
## ..$ .rows :List of 107
## .. ..$ : int 1 2
## .. ..$ : int 3 4 5
## .. ..$ : int 6 7 8 9
## .. ..$ : int 10 11
## .. ..$ : int 12
## .. ..$ : int 13 14 15
## .. ..$ : int 16 17 18 19
## .. ..$ : int 20 21 22
## .. ..$ : int 23 24 25
## .. ..$ : int 26 27 28 29
## .. ..$ : int 30 31 32 33 34 35 36 37 38
## .. ..$ : int 39 40 41 42 43
## .. ..$ : int 44 45
## .. ..$ : int 46 47 48
## .. ..$ : int 49 50 51 52 53
## .. ..$ : int 54 55
## .. ..$ : int 56 57 58 59 60
## .. ..$ : int 61
## .. ..$ : int 62 63 64
## .. ..$ : int 65 66 67
## .. ..$ : int 68 69 70
## .. ..$ : int 71
## .. ..$ : int 72 73 74
## .. ..$ : int 75 76 77 78
## .. ..$ : int 79 80
## .. ..$ : int 81 82 83
## .. ..$ : int 84 85 86 87
## .. ..$ : int 88 89 90 91
## .. ..$ : int 92 93
## .. ..$ : int 94
## .. ..$ : int 95 96 97 98
## .. ..$ : int 99 100 101
## .. ..$ : int 102 103 104 105 106
## .. ..$ : int 107 108
## .. ..$ : int 109 110
## .. ..$ : int 111 112
## .. ..$ : int 113 114 115 116
## .. ..$ : int 117 118
## .. ..$ : int 119 120 121 122
## .. ..$ : int 123 124
## .. ..$ : int 125 126 127
## .. ..$ : int 128 129 130 131
## .. ..$ : int 132 133
## .. ..$ : int 134 135 136 137
## .. ..$ : int 138
## .. ..$ : int 139 140 141 142 143
## .. ..$ : int 144 145 146
## .. ..$ : int 147
## .. ..$ : int 148 149
## .. ..$ : int 150 151 152
## .. ..$ : int 153
## .. ..$ : int 154 155
## .. ..$ : int 156 157
## .. ..$ : int 158 159
## .. ..$ : int 160 161 162
## .. ..$ : int 163 164
## .. ..$ : int 165 166 167 168 169
## .. ..$ : int 170
## .. ..$ : int 171 172
## .. ..$ : int 173 174
## .. ..$ : int 175 176 177 178
## .. ..$ : int 179 180
## .. ..$ : int 181
## .. ..$ : int 182 183
## .. ..$ : int 184 185 186
## .. ..$ : int 187 188 189
## .. ..$ : int 190 191 192
## .. ..$ : int 193 194 195 196 197
## .. ..$ : int 198 199
## .. ..$ : int 200 201 202 203 204
## .. ..$ : int 205 206 207 208 209
## .. ..$ : int 210 211 212
## .. ..$ : int 213 214
## .. ..$ : int 215 216
## .. ..$ : int 217 218 219 220
## .. ..$ : int 221 222 223 224 225
## .. ..$ : int 226 227 228 229
## .. ..$ : int 230 231 232
## .. ..$ : int 233 234 235
## .. ..$ : int 236 237
## .. ..$ : int 238 239
## .. ..$ : int 240 241 242 243
## .. ..$ : int 244 245
## .. ..$ : int 246 247 248
## .. ..$ : int 249 250 251 252 253
## .. ..$ : int 254 255 256
## .. ..$ : int 257 258 259 260
## .. ..$ : int 261 262
## .. ..$ : int 263
## .. ..$ : int 264 265
## .. ..$ : int 266 267 268 269
## .. ..$ : int 270 271 272
## .. ..$ : int 273 274 275 276
## .. ..$ : int 277 278 279 280
## .. ..$ : int 281 282
## .. ..$ : int 283 284 285
## .. ..$ : int 286
## .. ..$ : int 287
## .. ..$ : int 288 289 290 291 292
## .. .. [list output truncated]
## ..- attr(*, ".drop")= logi TRUE
# School-level aggregation: mean of each variable within each school,
# with NAs removed.
schoolData <- studentData %>%
group_by(schoolid) %>%
summarize_at(vars(mathgain, mathprep, housepov, yearstea), mean, na.rm=TRUE)
str(schoolData)
## Classes 'tbl_df', 'tbl' and 'data.frame': 107 obs. of 5 variables:
## $ schoolid: int 1 2 3 4 5 6 7 8 9 10 ...
## $ mathgain: num 59.6 65 88.9 35.2 60.2 ...
## $ mathprep: num 2.91 2.35 2.95 2 3.75 ...
## $ housepov: num 0.082 0.082 0.086 0.365 0.511 0.044 0.148 0.085 0.537 0.346 ...
## $ yearstea: num 1.73 6.02 14.93 22 3 ...
# First, plot the housepov and mathgain at the classroom-level from the classData data.frame
ggplot(data = classData, aes(x = housepov, y = mathgain)) +
geom_point() +
geom_smooth(method = 'lm')
# Second, plot the housepov and mathgain at the school-level from the schoolData data.frame
ggplot(data = schoolData, aes(x = housepov, y = mathgain)) +
geom_point() +
geom_smooth(method = 'lm')
# Third, compare your linear regression results from the previous exercise to the two new models
summary( lm(mathgain ~ housepov, data = studentData)) ## student-level data
##
## Call:
## lm(formula = mathgain ~ housepov, data = studentData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -168.226 -22.222 -1.306 19.763 195.156
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 56.937 1.674 34.02 <2e-16 ***
## housepov 3.531 7.515 0.47 0.639
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 34.63 on 1188 degrees of freedom
## Multiple R-squared: 0.0001858, Adjusted R-squared: -0.0006558
## F-statistic: 0.2208 on 1 and 1188 DF, p-value: 0.6385
# Same regression, refit on the classroom-level means
summary( lm(mathgain ~ housepov, data = classData)) ## class-level data
##
## Call:
## lm(formula = mathgain ~ housepov, data = classData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -80.479 -14.444 -1.447 13.151 156.516
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 58.160 2.542 22.879 <2e-16 ***
## housepov -3.236 10.835 -0.299 0.765
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 26.14 on 310 degrees of freedom
## Multiple R-squared: 0.0002876, Adjusted R-squared: -0.002937
## F-statistic: 0.08918 on 1 and 310 DF, p-value: 0.7654
# Same regression, refit on the school-level means
summary( lm(mathgain ~ housepov, data = schoolData)) ## school-level data
##
## Call:
## lm(formula = mathgain ~ housepov, data = schoolData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -46.660 -9.947 -2.494 9.546 41.445
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 59.338 2.624 22.616 <2e-16 ***
## housepov -11.948 10.987 -1.087 0.279
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.8 on 105 degrees of freedom
## Multiple R-squared: 0.01114, Adjusted R-squared: 0.00172
## F-statistic: 1.183 on 1 and 105 DF, p-value: 0.2793
# Plot the means of your data, predictor is your x-variable, response is your y-variable, and intDemo is your data.frame
# Toy data: three groups (a, b, c) with five observations each.
intDemo <- data.frame(predictor=factor(c(rep("a", 5), rep("b", 5), rep("c", 5))),
response=c(-1.207, 0.277, 1.084, -2.346, 0.429, 5.759, 4.138, 4.18, 4.153, 3.665, 9.046, 8.003, 8.447, 10.129, 11.919)
)
str(intDemo)
# The red points from stat_summary mark the mean of each group.
# NOTE(review): `fun.y` matches the ggplot2 3.2.1 shown in the session
# info above; ggplot2 >= 3.3.0 renamed this argument to `fun`.
ggIntDemo <- ggplot(intDemo, aes(x = predictor, y = response) ) +
geom_point() +
theme_minimal() + stat_summary(fun.y = "mean", color = "red",
size = 3, geom = "point") +
xlab("Intercept groups")
print(ggIntDemo)
# Fit a linear model to your data where response is "predicted by"(~) predictor
# The "- 1" drops the global intercept so each group's mean is
# estimated directly (cell-means coding).
intModel <- lm( response ~ predictor - 1 , data = intDemo)
summary(intModel)
##
## Call:
## lm(formula = response ~ predictor - 1, data = intDemo)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.9934 -0.7842 -0.2260 0.7056 2.4102
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## predictora -0.3526 0.5794 -0.609 0.554
## predictorb 4.3790 0.5794 7.558 6.69e-06 ***
## predictorc 9.5088 0.5794 16.412 1.38e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.296 on 12 degrees of freedom
## Multiple R-squared: 0.9646, Adjusted R-squared: 0.9557
## F-statistic: 109 on 3 and 12 DF, p-value: 5.696e-09
# Overlay the model's fitted coefficients (large blue points) on top of
# the raw-data plot.
#
# Tidies the fitted lm, strips the "predictor" prefix from the term
# names so they align with the factor levels on the x-axis, and adds
# the estimates as a layer before printing.
# NOTE(review): relies on the global ggIntDemo plot defined above.
extractAndPlotResults <- function(intModel){
  coef_df <- broom::tidy(intModel)
  coef_df$term <- gsub("predictor", "", coef_df$term)
  coef_df$term <- factor(coef_df$term)
  coefLayer <- geom_point(data = coef_df,
                          aes(x = term, y = estimate),
                          position = position_dodge(width = 0.4),
                          color = 'blue', size = 8, alpha = 0.25)
  print(ggIntDemo + coefLayer)
}
# Run the next code that extracts out the model's coefficients and plots them
extractAndPlotResults(intModel)
# Toy data for slope demonstrations: three groups measured at
# x = 0..4, with per-observation intercepts and responses.
multIntDemo <- data.frame(group=factor(c(rep("a", 5), rep("b", 5), rep("c", 5))),
x=rep(0:4, times=3),
intercept=c(4.11, -1.69, 1.09, 1.9, 1.21, 4.63, 10.29, 4.67, 12.06, 4.78, 15.22, 19.15, 4.44, 8.88, 9.47),
response=c(4.11, 2.31, 9.09, 13.9, 17.21, 4.63, 14.29, 12.67, 24.06, 20.78, 15.22, 23.15, 12.44, 20.88, 25.47)
)
str(multIntDemo)
## 'data.frame': 15 obs. of 4 variables:
## $ group : Factor w/ 3 levels "a","b","c": 1 1 1 1 1 2 2 2 2 2 ...
## $ x : int 0 1 2 3 4 0 1 2 3 4 ...
## $ intercept: num 4.11 -1.69 1.09 1.9 1.21 ...
## $ response : num 4.11 2.31 9.09 13.9 17.21 ...
# Plot the fully pooled regression: a single orange lm fit over all
# observations, with the points colored by group.
# NOTE(review): the fitted-model argument `out1` is never used — the
# line is refit by stat_smooth(). Reads multIntDemo from the global
# scope.
plot_output1 <- function(out1){
  pooledPlot <- ggplot(multIntDemo, aes(x = x, y = response))
  pooledPlot <- pooledPlot + geom_point(aes(color = group))
  pooledPlot <- pooledPlot + theme_minimal()
  pooledPlot <- pooledPlot + scale_color_manual(values = c("blue", "black", "red"))
  pooledPlot <- pooledPlot +
    stat_smooth(method = 'lm', fill = NA, color = 'orange', size = 3)
  print(pooledPlot)
}
# Plot the common-slope model: one line per group, all sharing the
# slope estimated by lm(response ~ x + group - 1).
#
# Coefficient layout from broom::tidy(): row 1 is the shared x slope;
# the remaining rows are the group-specific intercepts.
plot_output2 <- function(out2){
  coefs <- broom::tidy(out2)
  coefs$term <- gsub("group", "", coefs$term)
  sharedSlope <- pull(coefs[ 1, 2])
  lineData <- data.frame(group = pull(coefs[ -1, 1]),
                         slope = sharedSlope,
                         intercept = pull(coefs[ -1, 2])
                         )
  basePlot <- ggplot(multIntDemo, aes(x = x, y = response)) +
    geom_point(aes(color = group)) +
    theme_minimal() +
    scale_color_manual(values = c("blue", "black", "red"))
  print(basePlot +
          geom_abline(data = lineData,
                      aes(intercept = intercept, slope = slope, color = group)))
}
# Plot per-group regressions: stat_smooth refits lm(response ~ x)
# separately within each color group.
# NOTE(review): the argument `out3` is never used; multIntDemo is read
# from the global scope.
plot_output3 <- function(out3){
  perGroupPlot <- ggplot(multIntDemo, aes(x = x, y = response)) +
    geom_point(aes(color = group)) +
    scale_color_manual(values = c("blue", "black", "red")) +
    stat_smooth(method = 'lm', aes(color = group), fill = NA) +
    theme_minimal()
  print(perGroupPlot)
}
# First, run a model without considering different intercept for each group
# (complete pooling: one intercept and one slope for all groups)
out1 <- lm( response ~ x, data=multIntDemo )
summary(out1)
##
## Call:
## lm(formula = response ~ x, data = multIntDemo)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.101 -4.021 -2.011 3.590 11.739
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.141 2.615 3.113 0.00824 **
## x 3.270 1.068 3.062 0.00908 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.848 on 13 degrees of freedom
## Multiple R-squared: 0.4191, Adjusted R-squared: 0.3744
## F-statistic: 9.378 on 1 and 13 DF, p-value: 0.009081
plot_output1(out1)
# Considering same slope but different intercepts
# ("- 1" drops the global intercept so each group level is estimated directly)
out2 <- lm( response ~ x + group - 1, data=multIntDemo )
summary(out2)
##
## Call:
## lm(formula = response ~ x + group - 1, data = multIntDemo)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.992 -2.219 -0.234 1.810 6.988
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## x 3.2697 0.7516 4.350 0.001155 **
## groupa 2.7847 2.3767 1.172 0.266085
## groupb 8.7467 2.3767 3.680 0.003625 **
## groupc 12.8927 2.3767 5.425 0.000209 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.117 on 11 degrees of freedom
## Multiple R-squared: 0.9534, Adjusted R-squared: 0.9364
## F-statistic: 56.23 on 4 and 11 DF, p-value: 2.97e-07
plot_output2(out2)
# Considering different slope and intercept for each group (i.e., an interaction)
out3 <- lm( response ~ x + group - 1 + x:group, multIntDemo)
summary(out3)
##
## Call:
## lm(formula = response ~ x + group - 1 + x:group, data = multIntDemo)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.992 -2.429 -0.234 2.368 5.541
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## x 3.779 1.308 2.888 0.017941 *
## groupa 1.766 3.205 0.551 0.595053
## groupb 6.872 3.205 2.144 0.060621 .
## groupc 15.786 3.205 4.925 0.000819 ***
## x:groupb 0.428 1.851 0.231 0.822263
## x:groupc -1.956 1.851 -1.057 0.318050
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.138 on 9 degrees of freedom
## Multiple R-squared: 0.9615, Adjusted R-squared: 0.9358
## F-statistic: 37.42 on 6 and 9 DF, p-value: 7.263e-06
plot_output3(out3)
# New intercept/response values for the random-intercept demonstration.
multIntDemo$intercept <- c(-0.87, 3.35, 1.25, 0.88, -1.05, 4.55, 1.22, 3.34, 1.26, 3.75, 7.71, 9.59, 2.28, 1.9, 13.35)
multIntDemo$response <- c(-0.87, 6.35, 7.25, 9.88, 10.95, 4.55, 4.22, 9.34, 10.26, 15.75, 7.71, 12.59, 8.28, 10.9, 25.35)
# Run model
# (1 | group) fits a random intercept for each group; x is a fixed slope.
outLmer <- lme4::lmer( response ~ x + ( 1 | group), multIntDemo)
# Look at model outputs
summary( outLmer )
## Linear mixed model fit by REML ['lmerMod']
## Formula: response ~ x + (1 | group)
## Data: multIntDemo
##
## REML criterion at convergence: 76.9
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.31584 -0.61104 -0.01592 0.45125 2.19118
##
## Random effects:
## Groups Name Variance Std.Dev.
## group (Intercept) 7.98 2.825
## Residual 10.71 3.272
## Number of obs: 15, groups: group, 3
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 3.5540 2.1913 1.622
## x 2.9733 0.5975 4.977
##
## Correlation of Fixed Effects:
## (Intr)
## x -0.545
# Tidy the lmer fit: fixed effects plus random-effect SDs in one table.
broom::tidy( outLmer )
## Warning in bind_rows_(x, .id): binding factor and character vector, coercing
## into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector
## # A tibble: 4 x 5
## term estimate std.error statistic group
## <chr> <dbl> <dbl> <dbl> <chr>
## 1 (Intercept) 3.55 2.19 1.62 fixed
## 2 x 2.97 0.597 4.98 fixed
## 3 sd_(Intercept).group 2.82 NA NA group
## 4 sd_Observation.Residual 3.27 NA NA Residual
# Plot the lmer predictions (dashed lines) against per-group reference
# lines (solid).
#
# The solid lines use the `intercept` column of multIntDemo with a
# fixed slope: because multIntDemo has no `slope` column, aes() falls
# back to the `slope` argument (default 3) from the enclosing
# environment. The dashed lines are the fitted values from
# predict(outLmer). Only a function-local copy of multIntDemo gains
# the lmerPredict column.
extractAndPlotOutput <- function(outLmer, slope=3){
  multIntDemo$lmerPredict <- predict(outLmer)
  referencePlot <- ggplot(multIntDemo, aes(x = x, y = response)) +
    geom_point(aes(color = group)) +
    theme_minimal() +
    scale_color_manual(values = c("blue", "black", "red")) +
    geom_abline(data = multIntDemo,
                aes(intercept = intercept, slope = slope, color = group))
  fittedLayer <- geom_line(data = multIntDemo,
                           aes(x = x, y = lmerPredict, color = group),
                           linetype = 2)
  print(referencePlot + fittedLayer)
}
# Extract predictor variables and plot
extractAndPlotOutput(outLmer)
# Random effect slopes
# New values for the random-slope demonstration.
multIntDemo$response <- c(-0.72, 1.5, 4.81, 6.61, 13.62, 10.21, 9.64, 11.91, 16.39, 16.97, 8.76, 14.79, 15.83, 15.27, 17.36)
multIntDemo$intercept <- c(-0.72, -1.5, -1.19, -2.39, 1.62, 10.21, 6.64, 5.91, 7.39, 4.97, 8.76, 11.79, 9.83, 6.27, 5.36)
# ( x | group ) fits correlated random intercepts and slopes for each
# group; note no fixed slope for x is included in this specification.
outLmer2 <- lme4::lmer( response ~ ( x|group ), multIntDemo)
## boundary (singular) fit: see ?isSingular
summary(outLmer2)
## Linear mixed model fit by REML ['lmerMod']
## Formula: response ~ (x | group)
## Data: multIntDemo
##
## REML criterion at convergence: 69.7
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.56739 -0.54097 -0.06276 0.75132 1.27928
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## group (Intercept) 273.766 16.546
## x 6.090 2.468 -1.00
## Residual 2.467 1.571
## Number of obs: 15, groups: group, 3
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 21.676 1.383 15.67
## convergence code: 0
## boundary (singular) fit: see ?isSingular
# Tidy summary; the -1 intercept/slope correlation below matches the
# singular-fit warning reported when the model was fit.
broom::tidy(outLmer2)
## Warning in bind_rows_(x, .id): binding factor and character vector, coercing
## into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector
## # A tibble: 5 x 5
## term estimate std.error statistic group
## <chr> <dbl> <dbl> <dbl> <chr>
## 1 (Intercept) 21.7 1.38 15.7 fixed
## 2 sd_(Intercept).group 16.5 NA NA group
## 3 sd_x.group 2.47 NA NA group
## 4 cor_(Intercept).x.group -1 NA NA group
## 5 sd_Observation.Residual 1.57 NA NA Residual
# Overlay the lmer predictions (dashed lines) on per-group lm fits
# (solid lines from stat_smooth).
#
# stat_smooth refits lm(response ~ x) within each color group; the
# dashed lines are the fitted values from predict(outLmer2). Only a
# function-local copy of multIntDemo gains the lmerPredict2 column.
plotOutput <- function(outLmer2){
  multIntDemo$lmerPredict2 <- predict(outLmer2)
  lmFitPlot <- ggplot(multIntDemo, aes(x = x, y = response)) +
    geom_point(aes(color = group)) +
    theme_minimal() +
    scale_color_manual(values = c("blue", "black", "red")) +
    stat_smooth(method = 'lm', aes(color = group), fill = NA)
  predictedLayer <- geom_line(data = multIntDemo,
                              aes(x = x, y = lmerPredict2, color = group),
                              linetype = 2)
  print(lmFitPlot + predictedLayer)
}
# Extract and plot
plotOutput(outLmer2)
# Mixed effect model
# Random intercepts for classroom and school, with fixed effects for
# sex, mathprep, and mathknow; rows with NAs are dropped
# (na.action = "na.omit").
lmerModel <- lme4::lmer(mathgain ~ sex +
mathprep + mathknow + (1|classid) +
(1|schoolid), data = studentData, na.action = "na.omit",
REML = TRUE)
summary(lmerModel)
## Linear mixed model fit by REML ['lmerMod']
## Formula: mathgain ~ sex + mathprep + mathknow + (1 | classid) + (1 | schoolid)
## Data: studentData
##
## REML criterion at convergence: 10677.9
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -4.3203 -0.6146 -0.0294 0.5467 5.5331
##
## Random effects:
## Groups Name Variance Std.Dev.
## classid (Intercept) 103.57 10.177
## schoolid (Intercept) 85.44 9.244
## Residual 1019.47 31.929
## Number of obs: 1081, groups: classid, 285; schoolid, 105
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 52.250 3.838 13.613
## sexfemale -1.526 2.041 -0.747
## mathprep 2.426 1.298 1.869
## mathknow 2.405 1.299 1.851
##
## Correlation of Fixed Effects:
## (Intr) sexfml mthprp
## sexfemale -0.268
## mathprep -0.878 0.001
## mathknow -0.003 0.011 0.005
# Coefficient ("forest") plot of the fixed effects from an lmer fit.
#
# Keeps only the fixed-effect rows from broom::tidy() (dropping the
# intercept), then draws each estimate with its confidence interval as
# a horizontal range, with a red reference line at zero.
extractAndPlot <- function(lmerModel){
  tidied <- broom::tidy(lmerModel, conf.int = TRUE)
  keep <- tidied$group =="fixed" & tidied$term != "(Intercept)"
  fixedTerms <- tidied[keep, ]
  coefPlot <- ggplot(fixedTerms, aes(x = term, y = estimate,
                                     ymin = conf.low,
                                     ymax = conf.high)) +
    theme_minimal() +
    geom_hline(yintercept = 0.0, color = 'red', size = 2.0) +
    geom_point() +
    geom_linerange() +
    coord_flip()
  print(coefPlot)
}
# Extract the fixed effects and plot them with confidence intervals
extractAndPlot(lmerModel)
## Warning in bind_rows_(x, .id): binding factor and character vector, coercing
## into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector
Chapter 2 - Linear Mixed-Effect Models
Linear mixed effect model - Birth rates data:
Understanding and reporting the outputs of lmer:
Statistical inference with Maryland crime data:
Example code includes:
# Read in births data
# NOTE(review): relative path — assumes ./RInputFiles/ exists under
# the working directory.
rawBirths <- read.csv("./RInputFiles/countyBirthsDataUse.csv")
countyBirthsData <- rawBirths
str(countyBirthsData)
## 'data.frame': 580 obs. of 8 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Year : int 2015 2015 2015 2015 2015 2015 2015 2015 2015 2015 ...
## $ TotalPopulation : int 203709 115620 103057 104173 660367 156993 353089 415395 226519 119565 ...
## $ BirthRate : num 11.5 12.1 11.8 12.4 13.3 ...
## $ AverageBirthWeight: num 3261 3209 3239 3207 3177 ...
## $ AverageAgeofMother: num 27.5 26.3 25.8 26.9 27.9 ...
## $ CountyName : Factor w/ 472 levels "Ada","Adams",..: 22 64 141 189 200 229 248 273 278 279 ...
## $ State : Factor w/ 50 levels "AK","AL","AR",..: 2 2 2 2 2 2 2 2 2 2 ...
# First, build a lmer with state as a random effect. Then look at the model's summary and the plot of residuals.
# (Intercept-only model: no fixed-effect predictors.)
birthRateStateModel <- lme4::lmer(BirthRate ~ (1|State), data=countyBirthsData)
summary(birthRateStateModel)
## Linear mixed model fit by REML ['lmerMod']
## Formula: BirthRate ~ (1 | State)
## Data: countyBirthsData
##
## REML criterion at convergence: 2411
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.7957 -0.6056 -0.1063 0.5211 5.5948
##
## Random effects:
## Groups Name Variance Std.Dev.
## State (Intercept) 1.899 1.378
## Residual 3.256 1.804
## Number of obs: 578, groups: State, 50
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 12.3362 0.2216 55.67
plot(birthRateStateModel)
# Next, plot the predicted values from the model on top of the plot shown during the video.
countyBirthsData$birthPredictState <- predict(birthRateStateModel, countyBirthsData)
ggplot() + theme_minimal() +
geom_point(data =countyBirthsData, aes(x = TotalPopulation, y = BirthRate)) +
geom_point(data = countyBirthsData, aes(x = TotalPopulation, y = birthPredictState),
color = 'blue', alpha = 0.5
)
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 2 rows containing missing values (geom_point).
# Include the AverageAgeofMother as a fixed effect within the lmer and state as a random effect
ageMotherModel <- lme4::lmer( BirthRate ~ AverageAgeofMother + (1|State), data=countyBirthsData)
summary(ageMotherModel)
## Linear mixed model fit by REML ['lmerMod']
## Formula: BirthRate ~ AverageAgeofMother + (1 | State)
## Data: countyBirthsData
##
## REML criterion at convergence: 2347.6
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.9602 -0.6086 -0.1042 0.5144 5.2686
##
## Random effects:
## Groups Name Variance Std.Dev.
## State (Intercept) 1.562 1.250
## Residual 2.920 1.709
## Number of obs: 578, groups: State, 50
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 27.57033 1.81801 15.165
## AverageAgeofMother -0.53549 0.06349 -8.434
##
## Correlation of Fixed Effects:
## (Intr)
## AvrgAgfMthr -0.994
# Compare the random-effect model to the linear effect model
# (same fixed effect, but no State random intercept)
summary(lm(BirthRate ~ AverageAgeofMother, data = countyBirthsData))
##
## Call:
## lm(formula = BirthRate ~ AverageAgeofMother, data = countyBirthsData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.8304 -1.3126 -0.1795 1.2198 8.7327
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.06637 1.83374 15.851 <2e-16 ***
## AverageAgeofMother -0.59380 0.06441 -9.219 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.065 on 576 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.1286, Adjusted R-squared: 0.1271
## F-statistic: 84.99 on 1 and 576 DF, p-value: < 2.2e-16
# Include the AverageAgeofMother as a correlated random-effect slope parameter
# ( slope | group ) estimates the intercept-slope correlation.
ageMotherModelRandomCorrelated <- lme4::lmer(BirthRate ~ AverageAgeofMother + (AverageAgeofMother|State),
countyBirthsData)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, :
## Model failed to converge with max|grad| = 0.0133555 (tol = 0.002, component 1)
summary(ageMotherModelRandomCorrelated)
## Linear mixed model fit by REML ['lmerMod']
## Formula: BirthRate ~ AverageAgeofMother + (AverageAgeofMother | State)
## Data: countyBirthsData
##
## REML criterion at convergence: 2337.5
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.8402 -0.5965 -0.1132 0.5233 5.1817
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## State (Intercept) 78.33144 8.8505
## AverageAgeofMother 0.08433 0.2904 -0.99
## Residual 2.80345 1.6744
## Number of obs: 578, groups: State, 50
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 27.21961 2.41010 11.294
## AverageAgeofMother -0.52344 0.08293 -6.312
##
## Correlation of Fixed Effects:
## (Intr)
## AvrgAgfMthr -0.997
## convergence code: 0
## Model failed to converge with max|grad| = 0.0133555 (tol = 0.002, component 1)
# Include the AverageAgeofMother as an uncorrelated random-effect slope parameter
# (the "||" syntax fits independent random intercept and slope terms —
# see the expanded Formula in the summary output below)
ageMotherModelRandomUncorrelated <- lme4::lmer(BirthRate ~ AverageAgeofMother +
(AverageAgeofMother || State), data=countyBirthsData
)
## boundary (singular) fit: see ?isSingular
summary(ageMotherModelRandomUncorrelated)
## Linear mixed model fit by REML ['lmerMod']
## Formula:
## BirthRate ~ AverageAgeofMother + ((1 | State) + (0 + AverageAgeofMother |
## State))
## Data: countyBirthsData
##
## REML criterion at convergence: 2347.6
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.9602 -0.6086 -0.1042 0.5144 5.2686
##
## Random effects:
## Groups Name Variance Std.Dev.
## State (Intercept) 1.562 1.250
## State.1 AverageAgeofMother 0.000 0.000
## Residual 2.920 1.709
## Number of obs: 578, groups: State, 50
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 27.57031 1.81801 15.165
## AverageAgeofMother -0.53549 0.06349 -8.434
##
## Correlation of Fixed Effects:
## (Intr)
## AvrgAgfMthr -0.994
## convergence code: 0
## boundary (singular) fit: see ?isSingular
out <- ageMotherModelRandomUncorrelated
# Extract the fixed-effect coefficients
lme4::fixef(out)
# Extract the random-effect coefficients
# (the slope deviations are all 0 — the singular fit estimated zero
# variance for the random slope)
lme4::ranef(out)
## $State
## (Intercept) AverageAgeofMother
## AK 1.03554361 0
## AL -0.52501630 0
## AR 0.48024018 0
## AZ -1.04095779 0
## CA 0.50530282 0
## CO 0.09585291 0
## CT -1.91641428 0
## DC 0.96034296 0
## DE -0.38939548 0
## FL -1.87441671 0
## GA 0.39776296 0
## HI 0.08513460 0
## IA 0.96281025 0
## ID 1.17380179 0
## IL -0.12739802 0
## IN -0.32655768 0
## KS 0.85651904 0
## KY 0.64872241 0
## LA 1.04438181 0
## MA -1.40084157 0
## MD 0.10842594 0
## ME -1.63524235 0
## MI -1.13798940 0
## MN 0.93266949 0
## MO 0.07081678 0
## MS -0.21398117 0
## MT -0.13190987 0
## NC -0.28681725 0
## ND 0.99852760 0
## NE 1.49394698 0
## NH -1.45444986 0
## NJ -0.30090199 0
## NM -0.69755039 0
## NV 0.09013066 0
## NY -0.58164079 0
## OH -1.07391197 0
## OK 0.77998608 0
## OR -0.75846975 0
## PA -1.59333857 0
## RI -1.36399831 0
## SC -0.59295913 0
## SD 1.35146914 0
## TN -0.13513429 0
## TX 1.70872778 0
## UT 3.66063407 0
## VA 1.59188509 0
## VT -0.51108138 0
## WA 0.23008247 0
## WI -0.51647561 0
## WV -0.67686749 0
##
## with conditional variances for "State"
# Estimate the confidence intervals
# (profile CIs; the outer parentheses assign and print in one step)
(ciOut <- confint(out))
## Computing profile confidence intervals ...
## 2.5 % 97.5 %
## .sig01 0.0000000 1.61214393
## .sig02 0.0000000 0.05033958
## .sigma 1.6093152 1.81561031
## (Intercept) 24.0121844 31.14669045
## AverageAgeofMother -0.6605319 -0.41123093
# Technical note: Extracting out the regression coefficients from lmer is tricky (see the discussion between the lme4 and broom package authors)
# Extract out the parameter estimates and confidence intervals and manipulate the data
# Rows 4:5 of ciOut are the fixed-effect CIs (rows 1:3 are the
# variance components .sig01/.sig02/.sigma).
dataPlot <- data.frame(cbind( lme4::fixef(out), ciOut[ 4:5, ]))
rownames(dataPlot)[1] <- "Intercept"
colnames(dataPlot) <- c("mean", "l95", "u95")
dataPlot$parameter <- rownames(dataPlot)
# Print the new dataframe
print(dataPlot)
## mean l95 u95 parameter
## Intercept 27.5703059 24.0121844 31.1466905 Intercept
## AverageAgeofMother -0.5354876 -0.6605319 -0.4112309 AverageAgeofMother
# Plot the results using ggplot2
ggplot(dataPlot, aes(x = parameter, y = mean,
ymin = l95, ymax = u95)) +
geom_hline( yintercept = 0, color = 'red' ) +
geom_linerange() + geom_point() + coord_flip() + theme_minimal()
# Read in crime data
# NOTE(review): relative path — assumes ./RInputFiles/ exists under
# the working directory.
rawCrime <- read.csv("./RInputFiles/MDCrime.csv")
MDCrime <- rawCrime
str(MDCrime)
## 'data.frame': 192 obs. of 5 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ County: Factor w/ 24 levels "ALLEGANY","ANNE ARUNDEL",..: 2 3 4 5 6 7 8 9 10 11 ...
## $ Year : int 2006 2006 2006 2006 2006 2006 2006 2006 2006 2006 ...
## $ Crime : int 3167 10871 5713 257 149 374 490 729 181 752 ...
## $ Year2 : int 0 0 0 0 0 0 0 0 0 0 ...
# One line per county across years.
plot1 <- ggplot(data = MDCrime, aes(x = Year, y = Crime, group = County)) +
geom_line() + theme_minimal() +
ylab("Major crimes reported per county")
print(plot1)
plot1 + geom_smooth(method = 'lm')
# Null hypothesis testing uses p-values to see if a variable is "significant"
# Recently, the abuse and overuse of null hypothesis testing and p-values has caused the American Statistical Association to issue a statement about the use of p-values
# Because of these criticisms and other numerical challenges, Doug Bates (the creator of the lme4 package) does not include p-values as part of his package
# However, you may still want to estimate p-values, because p-values are still commonly used. Several packages exist, including the lmerTest package
# https://www.amstat.org/asa/files/pdfs/P-ValueStatement.pdf
# Load lmerTest
# library(lmerTest)
# Fit the model with Year as both a fixed and random-effect
# (raw calendar years produce the convergence warnings shown below)
lme4::lmer(Crime ~ Year + (1 + Year | County) , data = MDCrime)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, :
## unable to evaluate scaled gradient
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, :
## Model failed to converge: degenerate Hessian with 1 negative eigenvalues
## Linear mixed model fit by REML ['lmerMod']
## Formula: Crime ~ Year + (1 + Year | County)
## Data: MDCrime
## REML criterion at convergence: 2892.018
## Random effects:
## Groups Name Std.Dev. Corr
## County (Intercept) 386.29
## Year 1.34 -0.84
## Residual 328.32
## Number of obs: 192, groups: County, 24
## Fixed Effects:
## (Intercept) Year
## 136642.97 -67.33
## convergence code 0; 2 optimizer warnings; 0 lme4 warnings
# Fit the model with Year2 rather than Year
# (Year2 starts at 0 — see the str() output above — which avoids the
# convergence warnings seen with raw calendar years)
out <- lme4::lmer(Crime ~ Year2 + (1 + Year2 | County) , data = MDCrime)
# Examine the model's output
summary(out)
## Linear mixed model fit by REML ['lmerMod']
## Formula: Crime ~ Year2 + (1 + Year2 | County)
## Data: MDCrime
##
## REML criterion at convergence: 2535.7
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.8080 -0.2235 -0.0390 0.2837 3.0767
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## County (Intercept) 7584514 2754.00
## Year2 16940 130.15 -0.91
## Residual 8425 91.79
## Number of obs: 192, groups: County, 24
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 1577.28 562.29 2.805
## Year2 -67.33 26.72 -2.519
##
## Correlation of Fixed Effects:
## (Intr)
## Year2 -0.906
## Build the Null model with only County as a random-effect
null_model <- lme4::lmer(Crime ~ (1 | County) , data = MDCrime)
## Build the Year2 model with Year2 as a fixed and random slope and County as the random-effect
year_model <- lme4::lmer(Crime ~ Year2 + (1 + Year2 | County) , data = MDCrime)
## Compare the two models using an anova
# anova() refits both models with ML (instead of REML) so the likelihood-ratio comparison is valid
anova(null_model, year_model)
## refitting model(s) with ML (instead of REML)
## Data: MDCrime
## Models:
## null_model: Crime ~ (1 | County)
## year_model: Crime ~ Year2 + (1 + Year2 | County)
## Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## null_model 3 2954.4 2964.2 -1474.2 2948.4
## year_model 6 2568.9 2588.4 -1278.4 2556.9 391.52 3 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Chapter 3 - Generalized Linear Mixed-Effect Models
Crash course on GLMs - relaxing the assumptions around normality of the residuals:
Binomial data - modeling data with only two outcomes:
Count data:
Example code includes:
# In this case study, we will be working with simulated dose-response data
# The response is mortality (1) or survival (0) at the end of a study. During this exercise, we will fit a logistic regression using all three methods described in the video
# You have been given two datasets. dfLong has the data in a "long" format with each row corresponding to an observation (i.e., a 0 or 1)
# dfShort has the data in an aggregated format with each row corresponding to a treatment (e.g., 6 successes, 4 failures, number of replicates = 10, proportion = 0.6)
# Six dose levels (0 to 10 by 2), 20 observations per dose: 120 rows total
dfLong <- data.frame(dose=c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10),
mortality=c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1)
)
str(dfLong)
## 'data.frame': 120 obs. of 2 variables:
## $ dose : num 0 0 0 0 0 0 0 0 0 0 ...
## $ mortality: num 0 0 0 0 0 0 0 0 0 0 ...
# Aggregate the long data to one row per dose: total deaths, number of replicates,
# number of survivors, and the observed mortality proportion
dfShort <- dfLong %>%
group_by(dose) %>%
summarize(mortality=sum(mortality), nReps=n()) %>%
mutate(survival=nReps-mortality, mortalityP=mortality/nReps)
dfShort
## # A tibble: 6 x 5
## dose mortality nReps survival mortalityP
## <dbl> <dbl> <int> <dbl> <dbl>
## 1 0 0 20 20.0 0
## 2 2.00 4.00 20 16.0 0.200
## 3 4.00 8.00 20 12.0 0.400
## 4 6.00 10.0 20 10.0 0.500
## 5 8.00 11.0 20 9.00 0.550
## 6 10.0 13.0 20 7.00 0.650
# Fit a glm using data in a long format
# Each row is a single 0/1 outcome, so the response column can be used directly
fitLong <- glm(mortality ~ dose, data = dfLong, family = "binomial")
summary(fitLong)
##
## Call:
## glm(formula = mortality ~ dose, family = "binomial", data = dfLong)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5916 -0.8245 -0.4737 1.0440 1.8524
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.13075 0.44532 -4.785 1.71e-06 ***
## dose 0.30663 0.06821 4.495 6.95e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 159.76 on 119 degrees of freedom
## Residual deviance: 134.71 on 118 degrees of freedom
## AIC: 138.71
##
## Number of Fisher Scoring iterations: 3
# Fit a glm using data in a short format with two columns
# cbind(successes, failures) is the aggregated-binomial response form;
# the coefficient estimates match the long-format fit above
fitShort <- glm( cbind(mortality , survival ) ~ dose , data = dfShort, family = "binomial")
summary(fitShort)
##
## Call:
## glm(formula = cbind(mortality, survival) ~ dose, family = "binomial",
## data = dfShort)
##
## Deviance Residuals:
## 1 2 3 4 5 6
## -2.1186 0.2316 1.0698 0.6495 -0.2699 -0.6634
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.13075 0.44537 -4.784 1.72e-06 ***
## dose 0.30663 0.06822 4.495 6.97e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 31.6755 on 5 degrees of freedom
## Residual deviance: 6.6214 on 4 degrees of freedom
## AIC: 27.415
##
## Number of Fisher Scoring iterations: 4
# Fit a glm using data in a short format with weights
# Here the response is a proportion, so weights = number of trials per row;
# this is equivalent to the cbind() form above
fitShortP <- glm( mortalityP ~ dose , data = dfShort, weights = nReps , family = "binomial")
summary(fitShortP)
##
## Call:
## glm(formula = mortalityP ~ dose, family = "binomial", data = dfShort,
## weights = nReps)
##
## Deviance Residuals:
## 1 2 3 4 5 6
## -2.1186 0.2316 1.0698 0.6495 -0.2699 -0.6634
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.13075 0.44537 -4.784 1.72e-06 ***
## dose 0.30663 0.06822 4.495 6.97e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 31.6755 on 5 degrees of freedom
## Residual deviance: 6.6214 on 4 degrees of freedom
## AIC: 27.415
##
## Number of Fisher Scoring iterations: 4
# Small non-negative count data: compare a Gaussian lm with a Poisson glm below
y <- c(0, 1, 0, 1, 0, 1, 0, 1, 0, 2, 1, 2, 0, 1, 1, 0, 1, 5, 1, 1)
x <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20)
# Fit the linear model
summary(lm(y ~ x))
##
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.3677 -0.6145 -0.2602 0.4297 3.4805
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.15263 0.50312 0.303 0.7651
## x 0.07594 0.04200 1.808 0.0873 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.083 on 18 degrees of freedom
## Multiple R-squared: 0.1537, Adjusted R-squared: 0.1067
## F-statistic: 3.269 on 1 and 18 DF, p-value: 0.08733
# Fit the generalized linear model
# A Poisson glm (log link) respects the count nature of y; note the slope
# becomes significant at the 0.05 level here where the lm's did not
summary(glm(y ~ x, family = "poisson"))
##
## Call:
## glm(formula = y ~ x, family = "poisson")
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6389 -0.9726 -0.3115 0.5307 2.1559
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.04267 0.60513 -1.723 0.0849 .
## x 0.08360 0.04256 1.964 0.0495 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 23.589 on 19 degrees of freedom
## Residual deviance: 19.462 on 18 degrees of freedom
## AIC: 52.17
##
## Number of Fisher Scoring iterations: 5
# Often, we want to "look" at our data and trends in our data
# ggplot2 allows us to add trend lines to our data
# The default lines are created using a technique called local regression
# However, we can specify different models, including GLMs
# During this exercise, we'll see how to plot a GLM
# Plot the data using jittered points and the default stat_smooth
# (jitter separates the overplotted 0/1 points)
ggplot(data = dfLong, aes(x = dose, y = mortality)) +
geom_jitter(height = 0.05, width = 0.1) +
stat_smooth(fill = 'pink', color = 'red')
## `geom_smooth()` using method = 'loess'
# Plot the data using jittered points and the glm stat_smooth
# method.args passes the family through to the glm call used for the smoother
ggplot(data = dfLong, aes(x = dose, y = mortality)) +
geom_jitter(height = 0.05, width = 0.1) +
stat_smooth(method = 'glm', method.args = list(family = "binomial"))
# library(lmerTest)
# Simulated dose-response data with three replicates (a, b, c) of the
# six-dose design above: 360 rows total
df <- data.frame(dose=rep(rep(c(0, 2, 4, 6, 8, 10), each=20), times=3),
mortality=c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1),
replicate=factor(rep(letters[1:3], each=120))
)
str(df)
## 'data.frame': 360 obs. of 3 variables:
## $ dose : num 0 0 0 0 0 0 0 0 0 0 ...
## $ mortality: num 0 0 0 0 0 0 0 0 0 0 ...
## $ replicate: Factor w/ 3 levels "a","b","c": 1 1 1 1 1 1 1 1 1 1 ...
# Logistic mixed model: dose as a fixed effect, replicate as a random intercept
# (the summary below shows the replicate variance is essentially zero)
glmerOut <- lme4::glmer(mortality ~ dose + (1|replicate), family = 'binomial', data = df)
summary(glmerOut)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: mortality ~ dose + (1 | replicate)
## Data: df
##
## AIC BIC logLik deviance df.resid
## 378.1 389.8 -186.0 372.1 357
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.3484 -0.6875 -0.3031 0.6413 2.1907
##
## Random effects:
## Groups Name Variance Std.Dev.
## replicate (Intercept) 6.658e-15 8.16e-08
## Number of obs: 360, groups: replicate, 3
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.38736 0.27334 -8.734 <2e-16 ***
## dose 0.40948 0.04414 9.276 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## dose -0.884
# library(lmerTest)
# Fit the model and look at its summary
# modelOut <- lme4::glmer( cbind(Purchases, Pass) ~ friend + ranking + (1|city), data = allData, family = 'binomial')
# summary( modelOut)
# Compare outputs to a lmer model
# summary(lme4::lmer( Purchases/( Purchases + Pass) ~ friend + ranking + (1|city), data = allData))
# Run the code to see how to calculate odds ratios
# summary(modelOut)
# exp(fixef(modelOut)[2])
# exp(confint(modelOut)[3, ])
# Load lmerTest
# library(lmerTest)
# A/B-test style click data: 4 user groups (A-D), 10 users each, observed
# under both the old and the new webpage (80 rows)
userGroups <- data.frame(group=factor(rep(rep(LETTERS[1:4], each=10), times=2)),
webpage=factor(rep(c("old", "new"), each=40)),
clicks=c(0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 1, 1, 1, 2, 0, 1, 1, 0, 3, 2, 3, 1, 2, 4, 2, 1, 0, 2, 0, 1, 2, 0, 2, 1, 1, 2, 4, 2, 8, 1, 1, 1, 2, 1, 1, 0, 0, 3, 0, 1, 4, 1, 2, 0, 1, 1, 0, 0, 3, 2, 0, 3, 1, 2, 2, 0, 2, 3, 1, 3, 2, 4, 4, 2, 1, 5, 2)
)
str(userGroups)
## 'data.frame': 80 obs. of 3 variables:
## $ group : Factor w/ 4 levels "A","B","C","D": 1 1 1 1 1 1 1 1 1 1 ...
## $ webpage: Factor w/ 2 levels "new","old": 2 2 2 2 2 2 2 2 2 2 ...
## $ clicks : num 0 0 0 0 0 0 2 0 0 0 ...
# Fit a Poisson glmer
# clicks are counts, so a Poisson family is used; group is a random intercept
summary( lme4::glmer(clicks ~ webpage + (1|group), family = 'poisson', data = userGroups))
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: poisson ( log )
## Formula: clicks ~ webpage + (1 | group)
## Data: userGroups
##
## AIC BIC logLik deviance df.resid
## 255.5 262.6 -124.7 249.5 77
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.3999 -0.9104 -0.2340 0.4978 5.6126
##
## Random effects:
## Groups Name Variance Std.Dev.
## group (Intercept) 0.07093 0.2663
## Number of obs: 80, groups: group, 4
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.5524 0.1797 3.074 0.00211 **
## webpageold -0.5155 0.1920 -2.685 0.00726 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## webpageold -0.400
# library(lmerTest)
# Illinois count data: counts by age class, year, and county (see str() output below)
rawIL <- read.csv("./RInputFiles/ILData.csv")
ILdata <- rawIL
str(ILdata)
## 'data.frame': 1808 obs. of 4 variables:
## $ age : Factor w/ 4 levels "15_19","20_24",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ year : int 0 0 0 0 0 0 0 0 0 0 ...
## $ county: Factor w/ 47 levels "ALEXANDER","BROWN",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ count : int 0 0 0 5 0 7 0 4 0 12 ...
# Age goes before year
# Poisson glmer: age and year as fixed effects; random intercept and
# random slope for year within county
modelOut <- lme4::glmer(count ~ age + year + (year|county), family = 'poisson', data = ILdata)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control
## $checkConv, : Model failed to converge with max|grad| = 0.00144074 (tol =
## 0.001, component 1)
summary(modelOut)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: poisson ( log )
## Formula: count ~ age + year + (year | county)
## Data: ILdata
##
## AIC BIC logLik deviance df.resid
## 3215.6 3259.6 -1599.8 3199.6 1800
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.4511 -0.0151 -0.0056 -0.0022 4.0053
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## county (Intercept) 129.9459 11.3994
## year 0.0648 0.2546 -1.00
## Number of obs: 1808, groups: county, 47
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -10.76258 2.13022 -5.052 4.36e-07 ***
## age20_24 -0.04152 0.03690 -1.125 0.261
## age25_29 -1.16262 0.05290 -21.976 < 2e-16 ***
## age30_34 -2.28278 0.08487 -26.898 < 2e-16 ***
## year 0.32708 0.25422 1.287 0.198
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) a20_24 a25_29 a30_34
## age20_24 -0.008
## age25_29 -0.006 0.341
## age30_34 -0.004 0.213 0.148
## year -0.764 0.000 0.000 0.000
## convergence code: 0
## Model failed to converge with max|grad| = 0.00144074 (tol = 0.001, component 1)
# Extract out fixed effects
# fixef() returns the population-level coefficient estimates
lme4::fixef(modelOut)
## (Intercept) age20_24 age25_29 age30_34 year
## -10.76258497 -0.04151848 -1.16262225 -2.28277972 0.32708039
# Extract out random effects
# ranef() returns each county's estimated deviation from the fixed effects
lme4::ranef(modelOut)
## $county
## (Intercept) year
## ALEXANDER -0.2847724 0.006331741
## BROWN -0.2847724 0.006331741
## CALHOUN -0.2847724 0.006331741
## CARROLL 12.2418514 -0.260423999
## CASS -0.2847724 0.006331741
## CLARK 12.2137668 -0.268553354
## CLAY -0.2847724 0.006331741
## CRAWFORD 12.5037407 -0.265752695
## CUMBERLAND -0.2847724 0.006331741
## DE WITT 12.7456078 -0.277675211
## DOUGLAS 13.0751590 -0.306329903
## EDGAR 12.3642794 -0.283606045
## EDWARDS -0.2847724 0.006331741
## FAYETTE 12.8094530 -0.273060474
## FORD -0.2847724 0.006331741
## GALLATIN -0.2847724 0.006331741
## GREENE -0.2847724 0.006331741
## HAMILTON -0.2847724 0.006331741
## HANCOCK 12.8581265 -0.305650287
## HARDIN -0.2847724 0.006331741
## HENDERSON -0.2847724 0.006331741
## IROQUOIS 13.1616741 -0.311372907
## JASPER -0.2847724 0.006331741
## JERSEY 12.9202747 -0.272284048
## JO DAVIESS 12.7409389 -0.289747791
## JOHNSON -0.2847724 0.006331741
## LAWRENCE 12.3713561 -0.268571236
## MARSHALL -0.2847724 0.006331741
## MASON -0.2847724 0.006331741
## MENARD -0.2180916 0.004849989
## MERCER 12.7534193 -0.271678572
## MOULTRIE -0.2180916 0.004849989
## PIATT 12.5653132 -0.296687752
## PIKE 12.5310614 -0.259211299
## POPE -0.2180916 0.004849989
## PULASKI -0.2180916 0.004849989
## PUTNAM -0.2180916 0.004849989
## RICHLAND 12.0350865 -0.273928951
## SCHUYLER -0.2180916 0.004849989
## SCOTT -0.2180916 0.004849989
## SHELBY 12.5183293 -0.283472292
## STARK -0.2180916 0.004849989
## UNION 13.1465272 -0.308673332
## WABASH -0.2180916 0.004849989
## WASHINGTON -0.2180916 0.004849989
## WAYNE 12.1148896 -0.253234752
## WHITE -0.2180916 0.004849989
# Run code to see one method for plotting the data
# One line per county, faceted by age class, with a Poisson-glm smoother overlaid
ggplot(data = ILdata, aes(x = year, y = count, group = county)) +
geom_line() +
facet_grid(age ~ . ) +
stat_smooth( method = 'glm',
method.args = list( family = "poisson"), se = FALSE,
alpha = 0.5) +
theme_minimal()
Chapter 4 - Repeated Measures
An introduction to repeated measures:
Sleep study:
Hate in NY state?
Wrap up:
Example code includes:
# Paired before/after measurements on 10 subjects (labelled a-j in x)
y <- c(0.23, 2.735, -0.038, 6.327, -0.643, 1.69, -1.378, -1.228, -0.252, 2.014, -0.073, 6.101, 0.213, 3.127, -0.29, 8.395, -0.33, 2.735, 0.223, 1.301)
treat <- rep(c("before", "after"), times=10)
x <- rep(letters[1:10], each=2)
# Run a standard, non-paired t-test
t.test(y[treat == "before"], y[treat == "after"], paired = FALSE)
##
## Welch Two Sample t-test
##
## data: y[treat == "before"] and y[treat == "after"]
## t = -3.9043, df = 9.5409, p-value = 0.003215
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -5.594744 -1.512256
## sample estimates:
## mean of x mean of y
## -0.2338 3.3197
# Run a standard, paired t-test
# Pairing accounts for the repeated measurement on each subject
t.test(y[treat == "before"], y[treat == "after"], paired = TRUE)
##
## Paired t-test
##
## data: y[treat == "before"] and y[treat == "after"]
## t = -4.2235, df = 9, p-value = 0.002228
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -5.456791 -1.650209
## sample estimates:
## mean of the differences
## -3.5535
library(lmerTest)
## Loading required package: lme4
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Attaching package: 'lmerTest'
## The following object is masked from 'package:lme4':
##
## lmer
## The following object is masked from 'package:stats':
##
## step
library(lme4)
# Run the paired-test like before
# (repeated here to compare its p-value against the repeated-measures ANOVA below)
t.test(y[treat == "before"], y[treat == "after"], paired = TRUE)
##
## Paired t-test
##
## data: y[treat == "before"] and y[treat == "after"]
## t = -4.2235, df = 9, p-value = 0.002228
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -5.456791 -1.650209
## sample estimates:
## mean of the differences
## -3.5535
# Run a repeated-measures ANOVA
# Subject (x) enters as a random intercept; the resulting p-value (0.002228)
# matches the paired t-test above
anova(lmer( y ~ treat + (1|x)))
## Type III Analysis of Variance Table with Satterthwaite's method
## Sum Sq Mean Sq NumDF DenDF F value Pr(>F)
## treat 63.137 63.137 1 8.9999 17.838 0.002228 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Load lme4's sleepstudy data: reaction times for 18 subjects across days 0-9
data(sleepstudy, package="lme4")
str(sleepstudy)
## 'data.frame': 180 obs. of 3 variables:
## $ Reaction: num 250 259 251 321 357 ...
## $ Days : num 0 1 2 3 4 5 6 7 8 9 ...
## $ Subject : Factor w/ 18 levels "308","309","310",..: 1 1 1 1 1 1 1 1 1 1 ...
# Plot the data
# One line per subject plus a single overall lm trend line
ggplot(data = sleepstudy) +
geom_line(aes(x = Days, y = Reaction, group = Subject)) +
stat_smooth(aes(x = Days, y = Reaction), method = 'lm', size = 3, se = FALSE)
# Build a lm
# This ignores the repeated measurements within each subject
lm( Reaction ~ Days, data = sleepstudy)
##
## Call:
## lm(formula = Reaction ~ Days, data = sleepstudy)
##
## Coefficients:
## (Intercept) Days
## 251.41 10.47
# Build a lmer
# Random intercept per subject; wrapping the assignment in () also prints the model
(lmerOut <- lmer( Reaction ~ Days + (1|Subject), data = sleepstudy))
## Linear mixed model fit by REML ['lmerModLmerTest']
## Formula: Reaction ~ Days + (1 | Subject)
## Data: sleepstudy
## REML criterion at convergence: 1786.465
## Random effects:
## Groups Name Std.Dev.
## Subject (Intercept) 37.12
## Residual 30.99
## Number of obs: 180, groups: Subject, 18
## Fixed Effects:
## (Intercept) Days
## 251.41 10.47
# The lmer model you built during the previous exercise has been saved as lmerOut
# During this exercise, you will examine the effects of drug type using both an ANOVA framework and a regression framework
# Run an anova
# (lmerTest's anova method reports Satterthwaite-based F tests and p-values)
anova(lmerOut)
## Type III Analysis of Variance Table with Satterthwaite's method
## Sum Sq Mean Sq NumDF DenDF F value Pr(>F)
## Days 162703 162703 1 161 169.4 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Look at the regression coefficients
# With lmerTest loaded, summary() adds Satterthwaite df and p-values to the t-tests
summary(lmerOut)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: Reaction ~ Days + (1 | Subject)
## Data: sleepstudy
##
## REML criterion at convergence: 1786.5
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.2257 -0.5529 0.0109 0.5188 4.2506
##
## Random effects:
## Groups Name Variance Std.Dev.
## Subject (Intercept) 1378.2 37.12
## Residual 960.5 30.99
## Number of obs: 180, groups: Subject, 18
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 251.4051 9.7467 22.8102 25.79 <2e-16 ***
## Days 10.4673 0.8042 161.0000 13.02 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## Days -0.371
# Read in NY hate data
# Annual hate-crime incident counts per NY county, 2010-2016;
# Year2 is Year rescaled to start at 0 (see str() output below)
rawHate <- read.csv("./RInputFiles/hateNY.csv")
hate <- rawHate
str(hate)
## 'data.frame': 233 obs. of 4 variables:
## $ Year : int 2010 2011 2012 2013 2014 2015 2016 2013 2010 2011 ...
## $ County : Factor w/ 59 levels "Albany","Allegany",..: 1 1 1 1 1 1 1 2 3 3 ...
## $ TotalIncidents: int 13 7 5 3 3 3 3 1 22 11 ...
## $ Year2 : int 0 1 2 3 4 5 6 3 0 1 ...
# Plot incidents over time, one line per county, with lm trend lines overlaid
ggplot( data = hate, aes(x = Year, y = TotalIncidents, group = County)) +
geom_line() +
geom_smooth(method = 'lm', se = FALSE)
# During this exercise, you will build a glmer
# Because most of the incidents are small count values, use a Poisson (R function family poisson) error term
# First, build a model using the actual year (variable Year, such as 2006, 2007, etc) - this model will fail
# Second, build a model using the rescaled year (variable Year2, such as 0, 1, 2, etc)
# This demonstrates the importance of considering where the intercept is located when building regression models
# Recall that a variable x can be both a fixed and random effect in a lmer() or glmer(): for example lmer(y ~ x + (x | group)) demonstrates this syntax
# glmer with raw Year
glmer(TotalIncidents ~ Year + (Year|County), data = hate, family = "poisson")
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, :
## Model failed to converge with max|grad| = 0.370207 (tol = 0.001, component 1)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, : Model is nearly unidentifiable: very large eigenvalue
## - Rescale variables?;Model is nearly unidentifiable: large eigenvalue ratio
## - Rescale variables?
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: poisson ( log )
## Formula: TotalIncidents ~ Year + (Year | County)
## Data: hate
## AIC BIC logLik deviance df.resid
## 1165.2746 1182.5298 -577.6373 1155.2746 228
## Random effects:
## Groups Name Std.Dev. Corr
## County (Intercept) 217.8915
## Year 0.1084 -1.00
## Number of obs: 233, groups: County, 59
## Fixed Effects:
## (Intercept) Year
## 295.4814 -0.1464
## convergence code 0; 3 optimizer warnings; 0 lme4 warnings
# glmer with scaled Year, Year2
# Rescaling the year to start at 0 removes the convergence warnings seen above
glmerOut <- glmer(TotalIncidents ~ Year2 + (Year2|County), data = hate, family = "poisson")
summary(glmerOut)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: poisson ( log )
## Formula: TotalIncidents ~ Year2 + (Year2 | County)
## Data: hate
##
## AIC BIC logLik deviance df.resid
## 1165.3 1182.5 -577.6 1155.3 228
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.5434 -0.4864 -0.1562 0.3319 3.1939
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## County (Intercept) 1.16291 1.0784
## Year2 0.01175 0.1084 0.02
## Number of obs: 233, groups: County, 59
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.27952 0.16600 7.708 1.28e-14 ***
## Year2 -0.14622 0.03324 -4.398 1.09e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## Year2 -0.338
# Extract and manipulate data
# Order the county factor by each county's random slope for Year2,
# so the plot sorts counties by their estimated yearly trend
countyTrend <- ranef(glmerOut)$County
countyTrend$county <- factor(row.names(countyTrend), levels =row.names(countyTrend)[order(countyTrend$Year2)])
# Plot results
ggplot(data = countyTrend, aes(x = county, y = Year2)) + geom_point() +
coord_flip() +
ylab("Change in hate crimes per year") +
xlab("County")
Chapter 1 - Forecasting Demand with Time Series
Loading data in to an xts object:
ARIMA Time Series 101:
Forecasting and Evaluating:
Example code includes:
# Read in beverages data
# 176 weekly observations; the ".p" columns are used as prices and the
# matching plain columns as sales volumes in the analyses below
rawBev <- read.csv("./RInputFiles/Bev.csv")
bev <- rawBev
str(bev)
## 'data.frame': 176 obs. of 14 variables:
## $ M.hi.p : num 59.2 56.3 56.3 49.3 61.3 ...
## $ M.lo.p : num 29.2 26.3 26.2 26.1 25.9 ...
## $ MET.hi.p: num 63.7 60.3 60.8 55.1 65.1 ...
## $ MET.lo.p: num 26 25.5 25.7 26.5 25.7 ...
## $ MET.sp.p: num 50.1 48.8 48.6 47.7 50.8 ...
## $ SEC.hi.p: num 58.6 54.6 57.9 49.7 63.7 ...
## $ SEC.lo.p: num 29.2 26.3 26.2 26.1 25.9 ...
## $ M.hi : int 458 477 539 687 389 399 392 417 568 583 ...
## $ M.lo : int 1455 1756 2296 3240 2252 1901 1939 1999 1798 1558 ...
## $ MET.hi : int 2037 1700 1747 2371 1741 2072 2353 2909 3204 2395 ...
## $ MET.lo : int 3437 3436 3304 3864 3406 3418 3553 3376 3233 3262 ...
## $ MET.sp : int 468 464 490 657 439 453 423 408 501 481 ...
## $ SEC.hi : int 156 151 178 217 141 149 134 148 195 170 ...
## $ SEC.lo : int 544 624 611 646 624 610 623 599 551 539 ...
# Load xts package
library(xts)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
##
## first, last
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## Registered S3 methods overwritten by 'forecast':
## method from
## fitted.fracdiff fracdiff
## residuals.fracdiff fracdiff
# Create the dates object as an index for your xts object
# 176 weekly dates starting 2014-01-19, matching the 176 rows of bev
dates <- seq(as.Date("2014-01-19"), length = 176, by = "weeks")
# Create an xts object called bev_xts
bev_xts <- xts(bev, order.by = dates)
# Create the individual region sales as their own objects
MET_hi <- bev_xts[,"MET.hi"]
MET_lo <- bev_xts[,"MET.lo"]
MET_sp <- bev_xts[,"MET.sp"]
# Sum the region sales together
MET_t <- MET_hi + MET_lo + MET_sp
# Plot the metropolitan region total sales
plot(MET_t)
# Split the data into training and validation
# Training: everything before 2017; validation: the 22 weeks from 2017 onward
MET_t_train <- MET_t[index(MET_t) < "2017-01-01"]
MET_t_valid <- MET_t[index(MET_t) >= "2017-01-01"]
# Use auto.arima() function for metropolitan sales
MET_t_model <- auto.arima(MET_t_train)
# Forecast the first 22 weeks of 2017
forecast_MET_t <- forecast(MET_t_model, h = 22)
# Plot this forecast #
plot(forecast_MET_t)
# Convert to numeric for ease
for_MET_t <- as.numeric(forecast_MET_t$mean)
v_MET_t <- as.numeric(MET_t_valid)
# Calculate the MAE
MAE <- mean(abs(for_MET_t - v_MET_t))
# Calculate the MAPE
# Mean absolute error as a percentage of the actual validation values
MAPE <- 100*mean(abs(for_MET_t - v_MET_t)/v_MET_t)
# Print to see how good your forecast is!
print(MAE)
## [1] 898.8403
print(MAPE)
## [1] 17.10332
# Convert your forecast to an xts object
for_dates <- seq(as.Date("2017-01-01"), length = 22, by = "weeks")
for_MET_t_xts <- xts(forecast_MET_t$mean, order.by = for_dates)
# Plot the validation data set
plot(for_MET_t_xts, main = 'Forecast Comparison', ylim = c(4000, 8500))
# Overlay the forecast of 2017
lines(MET_t_valid, col = "blue")
# Plot the validation data set
plot(MET_t_valid, main = 'Forecast Comparison', ylim = c(4000, 8500))
# Overlay the forecast of 2017
lines(for_MET_t_xts, col = "blue")
# Convert the limits to xts objects
# Column 2 of lower/upper is the wider of forecast()'s two default interval levels
lower <- xts(forecast_MET_t$lower[, 2], order.by = for_dates)
upper <- xts(forecast_MET_t$upper[, 2], order.by = for_dates)
# Adding confidence intervals of forecast to plot
lines(lower, col = "blue", lty = "dashed")
lines(upper, col = "blue", lty = "dashed")
Chapter 2 - Components of Demand
Price elasticity:
Seasonal/holiday/promotional effects:
Forecasting with regression:
Example code includes:
# Split the full beverage data into training (pre-2017) and validation (2017)
bev_xts_train <- bev_xts[index(bev_xts) < "2017-01-01"]
bev_xts_valid <- bev_xts[index(bev_xts) >= "2017-01-01"]
# Save the prices of each product
l_MET_hi_p <- log(as.vector(bev_xts_train[, "MET.hi.p"]))
# Save as a data frame
MET_hi_train <- data.frame(as.vector(log(MET_hi[index(MET_hi) < "2017-01-01"])), l_MET_hi_p)
colnames(MET_hi_train) <- c("log_sales", "log_price")
# Calculate the regression
# log-log regression: the log_price coefficient is the price elasticity of demand
model_MET_hi <- lm(log_sales ~ log_price, data = MET_hi_train)
# Plot the product's sales
plot(MET_hi)
# Plot the product's price
MET_hi_p <- bev_xts_train[, "MET.hi.p"]
plot(MET_hi_p)
# Create date indices for New Year's week
n.dates <- as.Date(c("2014-12-28", "2015-12-27", "2016-12-25"))
# Create xts objects for New Year's
newyear <- as.xts(rep(1, 3), order.by = n.dates)
# Create sequence of dates for merging
dates_train <- seq(as.Date("2014-01-19"), length = 154, by = "weeks")
# Merge training dates into New Year's object
# fill = 0 turns newyear into a 0/1 dummy across all 154 training weeks
newyear <- merge(newyear, dates_train, fill = 0)
# Add newyear variable to your data frame
MET_hi_train <- data.frame(MET_hi_train, newyear=as.vector(newyear))
# Build regressions for the product
model_MET_hi_full <- lm(log_sales ~ log_price + newyear, data = MET_hi_train)
# Subset the validation prices #
l_MET_hi_p_valid <- log(as.vector(bev_xts_valid[, "MET.hi.p"]))
# Create a validation data frame #
MET_hi_valid <- data.frame(l_MET_hi_p_valid)
colnames(MET_hi_valid) <- "log_price"
# Predict the log of sales for your high end product
# (uses the price-only model, not the holiday-augmented model)
pred_MET_hi <- predict(model_MET_hi, MET_hi_valid)
# Convert predictions out of log scale
pred_MET_hi <- exp(pred_MET_hi)
# Convert to an xts object
dates_valid <- seq(as.Date("2017-01-01"), length = 22, by = "weeks")
pred_MET_hi_xts <- xts(pred_MET_hi, order.by = dates_valid)
# Plot the forecast
plot(pred_MET_hi_xts)
# Calculate and print the MAPE
MET_hi_v <- bev_xts_valid[,"MET.hi"]
MAPE <- 100*mean(abs((pred_MET_hi_xts - MET_hi_v)/MET_hi_v))
print(MAPE)
## [1] 29.57455
Chapter 3 - Blending Regression with Time Series
Residuals from regression model:
Forecasting residuals:
Transfer functions and ensembling:
Example code includes:
# Calculate the residuals from the model
MET_hi_full_res <- resid(model_MET_hi_full)
# Convert the residuals to an xts object
MET_hi_full_res <- xts(MET_hi_full_res, order.by = dates_train)
# Plot the histogram of the residuals
hist(MET_hi_full_res)
# Plot the residuals over time
plot(MET_hi_full_res)
# Build an ARIMA model on the residuals: MET_hi_arima
# Captures time-series structure the regression left in its residuals
MET_hi_arima <- auto.arima(MET_hi_full_res)
# Look at a summary of the model
summary(MET_hi_arima)
## Series: MET_hi_full_res
## ARIMA(3,0,1) with zero mean
##
## Coefficients:
## ar1 ar2 ar3 ma1
## 1.2150 -0.1758 -0.2945 -0.4675
## s.e. 0.1578 0.1885 0.0901 0.1574
##
## sigma^2 estimated as 0.03905: log likelihood=32.51
## AIC=-55.02 AICc=-54.61 BIC=-39.83
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.0004563225 0.1950202 0.1456822 -46.5269 224.0129 0.5251787
## ACF1
## Training set 0.001241359
# Forecast 22 weeks with your model: for_MET_hi_arima
for_MET_hi_arima <- forecast(MET_hi_arima, h=22)
# Print first 10 observations
head(for_MET_hi_arima$mean, n = 10)
## Time Series:
## Start = 1079
## End = 1142
## Frequency = 0.142857142857143
## [1] -0.05952097 -0.12319548 -0.10839523 -0.09251312 -0.05706575 -0.02114739
## [7] 0.01158452 0.03459993 0.04623106 0.04667681
# Convert your forecasts into an xts object
dates_valid <- seq(as.Date("2017-01-01"), length = 22, by = "weeks")
for_MET_hi_arima <- xts(for_MET_hi_arima$mean, order.by = dates_valid)
# Plot the forecast
plot(for_MET_hi_arima)
# Convert your residual forecast to the exponential version
# The residuals come from a log-sales regression, so exp() moves them back to the sales scale
for_MET_hi_arima <- exp(for_MET_hi_arima)
# Multiply your forecasts together!
# After exponentiation both pieces are multiplicative factors on the sales scale,
# so the combined (transfer-function style) forecast is their product
for_MET_hi_final <- for_MET_hi_arima * pred_MET_hi_xts
# Plot the final forecast - don't touch the options!
plot(for_MET_hi_final, ylim = c(1000, 4300))
# Overlay the validation data set
lines(MET_hi_v, col = "blue")
# Calculate the MAE
MAE <- mean(abs(for_MET_hi_final - MET_hi_v))
print(MAE)
## [1] 474.7013
# Calculate the MAPE
MAPE <- 100 * mean(abs(for_MET_hi_final - MET_hi_v)/MET_hi_v)
print(MAPE)
## [1] 28.44671
# Build an ARIMA model using the auto.arima function
# NOTE(review): this fits on the full MET_hi series (training AND validation weeks),
# then evaluates against the validation set — presumably intentional per the course,
# but fitting on the pre-2017 subset would avoid information leakage; confirm
MET_hi_model_arima <- auto.arima(MET_hi)
# Forecast the ARIMA model
for_MET_hi <- forecast(MET_hi_model_arima, h = length(MET_hi_v))
# Save the forecast as an xts object
dates_valid <- seq(as.Date("2017-01-01"), length = 22, by = "weeks")
for_MET_hi_xts <- xts(for_MET_hi$mean, order.by = dates_valid)
# Calculate the MAPE of the forecast
MAPE <- 100 * mean(abs(for_MET_hi_xts - MET_hi_v) / MET_hi_v)
print(MAPE)
## [1] 36.95411
# Ensemble the two forecasts together
for_MET_hi_en <- 0.5 * (for_MET_hi_xts + pred_MET_hi_xts)
# Calculate the MAE and MAPE
MAE <- mean(abs(for_MET_hi_en - MET_hi_v))
print(MAE)
## [1] 533.8911
MAPE <- 100 * mean(abs(for_MET_hi_en - MET_hi_v) / MET_hi_v)
print(MAPE)
## [1] 32.28549
Chapter 4 - Hierarchical Forecasting
Bottom-Up Hierarchical Forecasting:
Top-Down Hierarchical Forecasting:
Middle-Out Hierarchical Forecasting:
Wrap up:
Example code includes:
# NOTE(review): depends on MET_sp, bev_xts_train, bev_xts_valid and
# dates_valid from earlier chunks; "##" lines are captured output.
# Build a time series model
MET_sp_model_arima <- auto.arima(MET_sp)
# Forecast the time series model for 22 periods
for_MET_sp <- forecast(MET_sp_model_arima, h=22)
# Create an xts object
for_MET_sp_xts <- xts(for_MET_sp$mean, order.by=dates_valid)
# Validation data: the 2017 portion of the MET_sp series
MET_sp_v <- MET_sp["2017"]
# Calculate the MAPE
MAPE <- 100 * mean(abs(for_MET_sp_xts - MET_sp_v) / MET_sp_v)
print(MAPE)
## [1] 6.772393
# Build the regression training set: log sales, log price and holiday-week
# dummy variables
MET_sp_train <- bev_xts_train %>%
transform(log_sales = log(MET.sp), log_price=log(MET.sp.p))
MET_sp_train <- MET_sp_train[, c("log_sales", "log_price")]
MET_sp_train$newyear <- 0
MET_sp_train$valentine <- 0
MET_sp_train$christmas <- 0
MET_sp_train$mother <- 0
# Flag the week containing each holiday in 2014-2016
MET_sp_train[index(MET_sp_train) %in% as.Date(c("2014-12-28", "2015-12-27", "2016-12-25")), "newyear"] <- 1
MET_sp_train[index(MET_sp_train) %in% as.Date(c("2014-02-09", "2015-02-08", "2016-02-07")), "valentine"] <- 1
MET_sp_train[index(MET_sp_train) %in% as.Date(c("2014-12-21", "2015-12-20", "2016-12-18")), "christmas"] <- 1
MET_sp_train[index(MET_sp_train) %in% as.Date(c("2014-05-04", "2015-05-03", "2016-05-01")), "mother"] <- 1
# NOTE (original author's warning, kept): the modeling below was flagged as
# nonsense by the note-taker -- review before reusing it.
# Build a regression model
model_MET_sp <- lm(log_sales ~ log_price + newyear + valentine + christmas + mother, data = MET_sp_train)
# Build the matching predictor set for the validation window
MET_sp_valid <- as.data.frame(bev_xts_valid) %>%
mutate(log_sales = log(MET.sp), log_price=log(MET.sp.p)) %>%
select("log_sales", "log_price")
MET_sp_valid$newyear <- 0
MET_sp_valid$valentine <- 0
MET_sp_valid$christmas <- 0
MET_sp_valid$mother <- 0
# Rows 7 and 19 of the 22-week validation window are the holiday weeks
MET_sp_valid[7, "valentine"] <- 1
MET_sp_valid[19, "mother"] <- 1
# Drop the response column; only predictors are passed to predict()
MET_sp_valid$log_sales <- NULL
# Forecast the regression model using the predict function
pred_MET_sp <- predict(model_MET_sp, MET_sp_valid)
# Exponentiate your predictions (model is on the log scale) and create an
# xts object
pred_MET_sp <- exp(pred_MET_sp)
pred_MET_sp_xts <- xts(pred_MET_sp, order.by = dates_valid)
# Calculate MAPE
MAPE <- 100*mean(abs((pred_MET_sp_xts - MET_sp_v)/MET_sp_v))
print(MAPE)
## [1] 6.55473
# Ensemble the two forecasts (50/50 average)
for_MET_sp_en <- 0.5 * (for_MET_sp_xts + pred_MET_sp_xts)
# Calculate the MAPE
MAPE <- 100 * mean(abs(for_MET_sp_en - MET_sp_v) / MET_sp_v)
print(MAPE)
## [1] 6.08091
# Copy over pred_MET_lo_xts
# (hard-coded low-end product forecast supplied by the course)
pred_MET_lo_xts <- xts(c(2960.6, 2974.1, 2943.2, 2948.6, 2915.6, 2736.4, 2953.9, 3199.4, 2934, 2898.7, 3027.7, 3165.9, 3073.1, 2842.7, 2928.7, 3070.2, 2982.2, 3018, 3031.9, 2879.4, 2993.2, 2974.1), order.by=dates_valid)
# Calculate the metropolitan regional sales forecast
# (bottom-up hierarchy: sum the three product-level forecasts)
for_MET_total <- pred_MET_hi_xts + for_MET_sp_en + pred_MET_lo_xts
# Calculate a validation data set
MET_t_v <- bev_xts_valid[,"MET.hi"] + bev_xts_valid[,"MET.lo"] + bev_xts_valid[,"MET.sp"]
# Calculate the MAPE
MAPE <- 100 * mean(abs(for_MET_total - MET_t_v) / MET_t_v)
print(MAPE)
## [1] 10.61441
# Create the MET_total data
# (weekly regional totals over the training period, supplied by the course;
# dates_train must exist from an earlier chunk)
MET_total <- xts(data.frame(MET.hi=c(5942, 5600, 5541, 6892, 5586, 5943, 6329, 6693, 6938, 6138, 6361, 6378, 5423, 5097, 4937, 5496, 6870, 6626, 6356, 5657, 6577, 7202, 7381, 7404, 7204, 6667, 6153, 6035, 5633, 5283, 5178, 4758, 5058, 5254, 5954, 6166, 6247, 6304, 7202, 6662, 6814, 6174, 5412, 5380, 5674, 6472, 6912, 7404, 8614, 8849, 7174, 6489, 7174, 6555, 6402, 7671, 5012, 4790, 5075, 5238, 5615, 6113, 7706, 7811, 7898, 7232, 6585, 5870, 7084, 5125, 5330, 5553, 6349, 6195, 6271, 5851, 5333, 5854, 5609, 5649, 6051, 6409, 5786, 5190, 5085, 4949, 5151, 5147, 5426, 5509, 6956, 7870, 8224, 6685, 6153, 5802, 5244, 5162, 5036, 5025, 8378, 8944, 7109, 7605, 7846, 7598, 8012, 9551, 6102, 5366, 4932, 4962, 5392, 6194, 7239, 7621, 7460, 7097, 6596, 5848, 8306, 5344, 5848, 6341, 7364, 7269, 7053, 6682, 6971, 7521, 7063, 6298, 6003, 5227, 5047, 4877, 4851, 4628, 4516, 4442, 4935, 5181, 5431, 5866, 5919, 5704, 5957, 6019, 5962, 6021, 5880, 5674, 7439, 7415)),
order.by=dates_train
)
# Build a regional time series model
MET_t_model_arima <- auto.arima(MET_total)
# Calculate a 2017 forecast for 22 periods
for_MET_t <- forecast(MET_t_model_arima, h=22)
# Make an xts object from your forecast
for_MET_t_xts <- xts(for_MET_t$mean, order.by=dates_valid)
# Calculate the MAPE
MAPE <- 100 * mean(abs(for_MET_t_xts - MET_t_v) / MET_t_v)
print(MAPE)
## [1] 17.10332
# Top-down method 1: average of the weekly historical proportions,
# mean(product / total)
prop_hi <- mean(MET_hi/MET_total)
prop_lo <- mean(MET_lo/MET_total)
prop_sp <- mean(MET_sp/MET_total)
# Distribute out your forecast to each product
for_prop_hi <- prop_hi*for_MET_t_xts
for_prop_lo <- prop_lo*for_MET_t_xts
for_prop_sp <- prop_sp*for_MET_t_xts
# Calculate the MAPE's for each product
MAPE_hi <- 100 * mean(abs(for_prop_hi - MET_hi_v) / MET_hi_v)
print(MAPE_hi)
## [1] 38.7318
MET_lo_v <- bev_xts_valid[,"MET.lo"]
MAPE_lo <- 100 * mean(abs(for_prop_lo - MET_lo_v) / MET_lo_v)
print(MAPE_lo)
## [1] 10.70649
MAPE_sp <- 100 * mean(abs(for_prop_sp - MET_sp_v) / MET_sp_v)
print(MAPE_sp)
## [1] 6.232888
# Top-down method 2: proportion of the historical averages,
# mean(product) / mean(total) -- note this is NOT the same quantity as
# method 1 above (mean of ratios vs ratio of means)
prop_hi_2 <- mean(MET_hi) / mean(MET_total)
prop_lo_2 <- mean(MET_lo) / mean(MET_total)
prop_sp_2 <- mean(MET_sp) / mean(MET_total)
# Distribute out your forecast to each product
for_prop_hi_2 <- prop_hi_2 * for_MET_t_xts
for_prop_lo_2 <- prop_lo_2 * for_MET_t_xts
for_prop_sp_2 <- prop_sp_2 * for_MET_t_xts
# Calculate the MAPE's for each product
MAPE_hi <- 100 * mean(abs(for_prop_hi_2 - MET_hi_v) / MET_hi_v)
print(MAPE_hi)
## [1] 38.33559
MAPE_lo <- 100 * mean(abs(for_prop_lo_2 - MET_lo_v) / MET_lo_v)
print(MAPE_lo)
## [1] 8.450784
MAPE_sp <- 100 * mean(abs(for_prop_sp_2 - MET_sp_v) / MET_sp_v)
print(MAPE_sp)
## [1] 6.517045
# SEC region weekly totals over the training period (hard-coded data supplied
# by the course); dates_train must exist from an earlier chunk
SEC_total <- xts(data.frame(SEC.hi=c(700, 775, 789, 863, 765, 759, 757, 747, 746, 709, 749, 786, 796, 726, 727, 723, 778, 755, 739, 740, 723, 695, 727, 707, 725, 684, 667, 698, 727, 722, 748, 695, 742, 739, 715, 724, 686, 671, 688, 682, 710, 700, 672, 680, 695, 780, 751, 693, 809, 881, 703, 712, 768, 796, 808, 904, 641, 662, 693, 725, 719, 736, 715, 722, 732, 745, 689, 705, 811, 739, 744, 700, 745, 735, 732, 722, 721, 732, 750, 714, 752, 677, 731, 674, 720, 675, 741, 722, 715, 719, 649, 697, 743, 733, 772, 698, 690, 734, 713, 644, 788, 833, 749, 731, 670, 675, 675, 993, 773, 751, 697, 677, 750, 723, 780, 763, 721, 701, 704, 684, 985, 791, 731, 714, 704, 694, 685, 652, 708, 754, 747, 705, 711, 699, 712, 745, 706, 665, 666, 692, 676, 696, 689, 697, 689, 717, 697, 708, 660, 707, 715, 680, 922, 888)), order.by=dates_train
)
# Build a time series model for the region
SEC_t_model_arima <- auto.arima(SEC_total)
# Forecast the time series model
for_SEC_t <- forecast(SEC_t_model_arima, h=22)
# Make into an xts object
for_SEC_t_xts <- xts(for_SEC_t$mean, order.by=dates_valid)
# Regional validation series: sum of the two SEC products
SEC_t_v <- bev_xts_valid$SEC.hi + bev_xts_valid$SEC.lo
# Calculate the MAPE
MAPE <- 100 * mean(abs(for_SEC_t_xts - SEC_t_v) / SEC_t_v)
print(MAPE)
## [1] 4.742324
# Pull the training and validation series for each SEC product
SEC_hi <- bev_xts_train[, "SEC.hi"]
SEC_lo <- bev_xts_train[, "SEC.lo"]
SEC_hi_v <- bev_xts_valid[, "SEC.hi"]
SEC_lo_v <- bev_xts_valid[, "SEC.lo"]
# Calculate the average of historical proportions
prop_hi <- mean(SEC_hi / SEC_total)
prop_lo <- mean(SEC_lo / SEC_total)
# Distribute the forecast
for_prop_hi <- prop_hi * for_SEC_t_xts
for_prop_lo <- prop_lo * for_SEC_t_xts
# Calculate a MAPE for each product
MAPE_hi <- 100 * mean(abs(for_prop_hi - SEC_hi_v) / SEC_hi_v)
print(MAPE_hi)
## [1] 7.988508
MAPE_lo <- 100 * mean(abs(for_prop_lo - SEC_lo_v) / SEC_lo_v)
print(MAPE_lo)
## [1] 5.202529
# Copy over for_M_t_xts data
# (hard-coded regional forecast for the third region, supplied by the course)
for_M_t_xts <- xts(c(2207, 2021, 2010, 2052, 2075, 2074, 2065, 2058, 2056, 2055, 2053, 2052, 2050, 2049, 2048, 2047, 2046, 2045, 2044, 2043, 2043, 2042), order.by=dates_valid)
# Calculate the state sales forecast: for_state
# (state total = sum of the three regional forecasts; fixed to use the
# standard `<-` assignment operator, consistent with the rest of the file,
# instead of `=`)
for_state <- for_SEC_t_xts + for_MET_t_xts + for_M_t_xts
# See the forecasts
for_state
## [,1]
## 2017-01-01 9996.689
## 2017-01-08 9525.915
## 2017-01-15 9342.760
## 2017-01-22 9269.321
## 2017-01-29 9214.912
## 2017-02-05 9162.005
## 2017-02-12 9118.199
## 2017-02-19 9087.859
## 2017-02-26 9070.209
## 2017-03-05 9058.715
## 2017-03-12 9049.677
## 2017-03-19 9043.959
## 2017-03-26 9038.794
## 2017-04-02 9035.673
## 2017-04-09 9033.250
## 2017-04-16 9031.296
## 2017-04-23 9029.656
## 2017-04-30 9028.227
## 2017-05-07 9026.939
## 2017-05-14 9025.746
## 2017-05-21 9025.617
## 2017-05-28 9024.530
Chapter 1 - Identifying the Best Recruiting Source
Introduction - Ben Teusch, HR Analytics Consultant:
Recruiting and quality of hire:
Visualizing recruiting data:
Example code includes:
# Import the recruitment data
recruitment <- readr::read_csv("./RInputFiles/recruitment_data.csv")
## Parsed with column specification:
## cols(
## attrition = col_double(),
## performance_rating = col_double(),
## sales_quota_pct = col_double(),
## recruiting_source = col_character()
## )
# Look at the first few rows of the dataset
head(recruitment)
## # A tibble: 6 x 4
## attrition performance_rating sales_quota_pct recruiting_source
## <dbl> <dbl> <dbl> <chr>
## 1 1 3 1.09 Applied Online
## 2 0 3 2.39 <NA>
## 3 1 2 0.498 Campus
## 4 0 2 2.51 <NA>
## 5 0 3 1.42 Applied Online
## 6 1 3 0.548 Referral
# Get an overview of the recruitment data
summary(recruitment)
## attrition performance_rating sales_quota_pct recruiting_source
## Min. :0.000 Min. :1.000 Min. :-0.7108 Length:446
## 1st Qu.:0.000 1st Qu.:2.000 1st Qu.: 0.5844 Class :character
## Median :0.000 Median :3.000 Median : 1.0701 Mode :character
## Mean :0.213 Mean :2.895 Mean : 1.0826
## 3rd Qu.:0.000 3rd Qu.:3.000 3rd Qu.: 1.5325
## Max. :1.000 Max. :5.000 Max. : 3.6667
# See which recruiting sources the company has been using
# (note the 205 rows with a missing source show up as an <NA> group)
recruitment %>%
count(recruiting_source)
## # A tibble: 5 x 2
## recruiting_source n
## <chr> <int>
## 1 Applied Online 130
## 2 Campus 56
## 3 Referral 45
## 4 Search Firm 10
## 5 <NA> 205
# Find the average sales quota attainment for each recruiting source
# (group_by keeps NA as its own group -- see the <NA> row in the output)
avg_sales <- recruitment %>%
group_by(recruiting_source) %>%
summarize(avg_sales_quota_pct=mean(sales_quota_pct))
# Display the result
avg_sales
## # A tibble: 5 x 2
## recruiting_source avg_sales_quota_pct
## <chr> <dbl>
## 1 Applied Online 1.06
## 2 Campus 0.908
## 3 Referral 1.02
## 4 Search Firm 0.887
## 5 <NA> 1.17
# Find the average attrition for the sales team, by recruiting source, sorted from lowest attrition rate to highest
avg_attrition <- recruitment %>%
group_by(recruiting_source) %>%
summarize(attrition_rate=mean(attrition)) %>%
arrange(attrition_rate)
# Display the result
avg_attrition
## # A tibble: 5 x 2
## recruiting_source attrition_rate
## <chr> <dbl>
## 1 <NA> 0.132
## 2 Applied Online 0.246
## 3 Campus 0.286
## 4 Referral 0.333
## 5 Search Firm 0.5
# Plot the bar chart of average quota attainment by recruiting source
avg_sales %>% ggplot(aes(x=recruiting_source, y=avg_sales_quota_pct)) + geom_col()
# Plot the bar chart of attrition rate by recruiting source
avg_attrition %>% ggplot(aes(x=recruiting_source, y=attrition_rate)) + geom_col()
Chapter 2 - What is driving low employee engagement?
Analyzing employee engagement:
Visualizing the engagement data:
Are differences meaningful?
Example code includes:
# Import the employee engagement survey data
survey <- readr::read_csv("./RInputFiles/survey_data.csv")
## Parsed with column specification:
## cols(
## employee_id = col_double(),
## department = col_character(),
## engagement = col_double(),
## salary = col_double(),
## vacation_days_taken = col_double()
## )
# Get an overview of the data
summary(survey)
## employee_id department engagement salary
## Min. : 1.0 Length:1470 Min. :1.00 Min. : 45530
## 1st Qu.: 491.2 Class :character 1st Qu.:3.00 1st Qu.: 59407
## Median :1020.5 Mode :character Median :3.00 Median : 70481
## Mean :1024.9 Mean :3.05 Mean : 74162
## 3rd Qu.:1555.8 3rd Qu.:4.00 3rd Qu.: 84763
## Max. :2068.0 Max. :5.00 Max. :164073
## vacation_days_taken
## Min. : 0.00
## 1st Qu.: 6.00
## Median :10.00
## Mean :11.27
## 3rd Qu.:16.00
## Max. :38.00
# Examine the counts of the department variable
survey %>% count(department)
## # A tibble: 3 x 2
## department n
## <chr> <int>
## 1 Engineering 961
## 2 Finance 63
## 3 Sales 446
# Output the average engagement score for each department, sorted
survey %>%
group_by(department) %>%
summarize(avg_engagement=mean(engagement)) %>%
arrange(avg_engagement)
## # A tibble: 3 x 2
## department avg_engagement
## <chr> <dbl>
## 1 Sales 2.81
## 2 Engineering 3.15
## 3 Finance 3.24
# Create the disengaged variable (score of 1 or 2 on the 1-5 scale) and
# assign the result to survey_disengaged
survey_disengaged <- survey %>%
mutate(disengaged = ifelse(engagement <= 2, 1, 0))
survey_disengaged
## # A tibble: 1,470 x 6
## employee_id department engagement salary vacation_days_taken disengaged
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 1 Sales 3 103264. 7 0
## 2 2 Engineering 3 80709. 12 0
## 3 4 Engineering 3 60737. 12 0
## 4 5 Engineering 3 99116. 7 0
## 5 7 Engineering 3 51022. 18 0
## 6 8 Engineering 3 98400. 9 0
## 7 10 Engineering 3 57106. 18 0
## 8 11 Engineering 1 55065. 4 1
## 9 12 Engineering 4 77158. 12 0
## 10 13 Engineering 2 48365. 14 1
## # ... with 1,460 more rows
# Summarize the three variables by department
survey_summary <- survey_disengaged %>%
group_by(department) %>%
summarize(pct_disengaged=mean(disengaged),
avg_salary=mean(salary),
avg_vacation_taken=mean(vacation_days_taken)
)
survey_summary
## # A tibble: 3 x 4
## department pct_disengaged avg_salary avg_vacation_taken
## <chr> <dbl> <dbl> <dbl>
## 1 Engineering 0.206 73576. 12.2
## 2 Finance 0.190 76652. 11.5
## 3 Sales 0.330 75074. 9.22
# Reshape the summary into long form for plotting.
# (pivot_longer() replaces the superseded gather(); tidyr >= 1.0 is already
# loaded per the document header, and names_to/values_to mirror the old
# key/value arguments, so the resulting plot is unchanged.)
survey_gathered <- survey_summary %>%
  pivot_longer(c(pct_disengaged, avg_salary, avg_vacation_taken),
               names_to = "measure", values_to = "value")
# Create three bar charts, one facet per measure, with free y scales so each
# measure uses its own axis range
ggplot(survey_gathered, aes(x=measure, y=value, fill=department)) +
geom_col(position="dodge") +
facet_wrap(~ measure, scales="free")
# Add the in_sales variable
# (engagement < 3 is equivalent to the engagement <= 2 threshold used in the
# earlier chunk, since scores are whole numbers 1-5 per the summary above)
survey_sales <- survey %>%
mutate(in_sales = ifelse(department == "Sales", "Sales", "Other"),
disengaged = ifelse(engagement < 3, 1L, 0L)
)
# Test the hypothesis (disengagement is associated with being in sales)
# using survey_sales
chisq.test(survey_sales$disengaged, survey_sales$in_sales)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: survey_sales$disengaged and survey_sales$in_sales
## X-squared = 25.524, df = 1, p-value = 4.368e-07
# Same question via a two-sample t-test on the 0/1 disengaged indicator
t.test(disengaged ~ in_sales, data=survey_sales)
##
## Welch Two Sample t-test
##
## data: disengaged by in_sales
## t = -4.862, df = 743.16, p-value = 1.419e-06
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.17479596 -0.07424062
## sample estimates:
## mean in group Other mean in group Sales
## 0.2050781 0.3295964
# Test the hypothesis that vacation days differ for sales, using the
# survey_sales data
t.test(vacation_days_taken ~ in_sales, data = survey_sales)
##
## Welch Two Sample t-test
##
## data: vacation_days_taken by in_sales
## t = 8.1549, df = 1022.9, p-value = 1.016e-15
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 2.229473 3.642409
## sample estimates:
## mean in group Other mean in group Sales
## 12.160156 9.224215
Chapter 3 - Are new hires getting paid too much?
Paying new hires fairly:
Omitted variable bias:
Linear regression helps to test the multivariate impacts of variables:
Example code includes:
# Import the fair-pay data
pay <- readr::read_csv("./RInputFiles/fair_pay_data.csv")
## Parsed with column specification:
## cols(
## employee_id = col_double(),
## department = col_character(),
## salary = col_double(),
## new_hire = col_character(),
## job_level = col_character()
## )
# Get an overview of the data
summary(pay)
## employee_id department salary new_hire
## Min. : 1.0 Length:1470 Min. : 43820 Length:1470
## 1st Qu.: 491.2 Class :character 1st Qu.: 59378 Class :character
## Median :1020.5 Mode :character Median : 70425 Mode :character
## Mean :1024.9 Mean : 74142
## 3rd Qu.:1555.8 3rd Qu.: 84809
## Max. :2068.0 Max. :164073
## job_level
## Length:1470
## Class :character
## Mode :character
##
##
##
# Check average salary of new hires and non-new hires
pay %>%
group_by(new_hire) %>%
summarize(avg_salary=mean(salary))
## # A tibble: 2 x 2
## new_hire avg_salary
## <chr> <dbl>
## 1 No 73425.
## 2 Yes 76074.
# Perform the correct statistical test (two-sample t-test on salary)
t.test(salary ~ new_hire, data = pay)
##
## Welch Two Sample t-test
##
## data: salary by new_hire
## t = -2.3437, df = 685.16, p-value = 0.01938
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -4869.4242 -429.9199
## sample estimates:
## mean in group No mean in group Yes
## 73424.60 76074.28
# Same test, tidied into a one-row tibble with broom
t.test(salary ~ new_hire, data = pay) %>%
broom::tidy()
## # A tibble: 1 x 10
## estimate estimate1 estimate2 statistic p.value parameter conf.low conf.high
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 -2650. 73425. 76074. -2.34 0.0194 685. -4869. -430.
## # ... with 2 more variables: method <chr>, alternative <chr>
# Create a stacked bar chart of job level composition by hire status
pay %>%
ggplot(aes(x=new_hire, fill=job_level)) +
geom_bar(position="fill")
# Calculate the average salary for each group of interest
pay_grouped <- pay %>%
group_by(new_hire, job_level) %>%
summarize(avg_salary = mean(salary))
# Graph the results using facet_wrap()
pay_grouped %>%
ggplot(aes(x=new_hire, y=avg_salary)) +
geom_col() +
facet_wrap(~ job_level)
# Filter the data to include only hourly employees
pay_filter <- pay %>%
filter(job_level == "Hourly")
# Test the difference in pay within the hourly level only
t.test(salary ~ new_hire, data=pay_filter) %>%
broom::tidy()
## # A tibble: 1 x 10
## estimate estimate1 estimate2 statistic p.value parameter conf.low conf.high
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 -1107. 63966. 65073. -1.75 0.0807 500. -2349. 136.
## # ... with 2 more variables: method <chr>, alternative <chr>
# Run the simple regression (salary on new_hire only)
model_simple <- lm(salary ~ new_hire, data = pay)
# Display the summary of model_simple
model_simple %>%
summary()
##
## Call:
## lm(formula = salary ~ new_hire, data = pay)
##
## Residuals:
## Min 1Q Median 3Q Max
## -32255 -14466 -3681 10740 87998
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 73424.6 577.2 127.200 <2e-16 ***
## new_hireYes 2649.7 1109.4 2.388 0.017 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18900 on 1468 degrees of freedom
## Multiple R-squared: 0.003871, Adjusted R-squared: 0.003193
## F-statistic: 5.705 on 1 and 1468 DF, p-value: 0.01704
# Display a tidy summary
model_simple %>%
broom::tidy()
## # A tibble: 2 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 73425. 577. 127. 0
## 2 new_hireYes 2650. 1109. 2.39 0.0170
# Run the multiple regression, controlling for job_level
# (per the output below, the new_hire effect drops from ~2650 to ~783 and
# loses significance once job level is included -- the omitted-variable
# point this chapter is making)
model_multiple <- lm(salary ~ new_hire + job_level, data = pay)
# Display the summary of model_multiple
model_multiple %>%
summary()
##
## Call:
## lm(formula = salary ~ new_hire + job_level, data = pay)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21013 -7091 -425 6771 44322
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 64049.3 308.3 207.722 <2e-16 ***
## new_hireYes 782.7 524.8 1.491 0.136
## job_levelManager 54918.8 915.3 60.001 <2e-16 ***
## job_levelSalaried 26865.6 567.2 47.369 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8930 on 1466 degrees of freedom
## Multiple R-squared: 0.7779, Adjusted R-squared: 0.7775
## F-statistic: 1712 on 3 and 1466 DF, p-value: < 2.2e-16
# Display a tidy summary
model_multiple %>%
broom::tidy()
## # A tibble: 4 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 64049. 308. 208. 0.
## 2 new_hireYes 783. 525. 1.49 1.36e- 1
## 3 job_levelManager 54919. 915. 60.0 0.
## 4 job_levelSalaried 26866. 567. 47.4 7.39e-298
Chapter 4 - Are performance ratings being given consistently?
Joining HR data:
Performance ratings and fairness:
Logistic regression is especially helpful for modeling binary response variables:
Example code includes:
# Import the HR and performance-rating data sets
hr_data <- readr::read_csv("./RInputFiles/hr_data.csv")
## Parsed with column specification:
## cols(
## employee_id = col_double(),
## department = col_character(),
## job_level = col_character(),
## gender = col_character()
## )
performance_data <- readr::read_csv("./RInputFiles/performance_data.csv")
## Parsed with column specification:
## cols(
## employee_id = col_double(),
## rating = col_double()
## )
# Examine the datasets
summary(hr_data)
## employee_id department job_level gender
## Min. : 1.0 Length:1470 Length:1470 Length:1470
## 1st Qu.: 491.2 Class :character Class :character Class :character
## Median :1020.5 Mode :character Mode :character Mode :character
## Mean :1024.9
## 3rd Qu.:1555.8
## Max. :2068.0
summary(performance_data)
## employee_id rating
## Min. : 1.0 Min. :1.00
## 1st Qu.: 491.2 1st Qu.:2.00
## Median :1020.5 Median :3.00
## Mean :1024.9 Mean :2.83
## 3rd Qu.:1555.8 3rd Qu.:4.00
## Max. :2068.0 Max. :5.00
# Join the two tables on the shared employee_id key
joined_data <- left_join(hr_data, performance_data, by = "employee_id")
# Examine the result
summary(joined_data)
## employee_id department job_level gender
## Min. : 1.0 Length:1470 Length:1470 Length:1470
## 1st Qu.: 491.2 Class :character Class :character Class :character
## Median :1020.5 Mode :character Mode :character Mode :character
## Mean :1024.9
## 3rd Qu.:1555.8
## Max. :2068.0
## rating
## Min. :1.00
## 1st Qu.:2.00
## Median :3.00
## Mean :2.83
## 3rd Qu.:4.00
## Max. :5.00
# Check whether the average performance rating differs by gender
joined_data %>%
group_by(gender) %>%
summarize(avg_rating = mean(rating))
## # A tibble: 2 x 2
## gender avg_rating
## <chr> <dbl>
## 1 Female 2.75
## 2 Male 2.92
# Add the high_performer column (rating of 4 or 5)
performance <- joined_data %>%
mutate(high_performer = ifelse(rating >= 4, 1, 0))
# Test whether one gender is more likely to be a high performer
chisq.test(performance$gender, performance$high_performer)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: performance$gender and performance$high_performer
## X-squared = 22.229, df = 1, p-value = 2.42e-06
# Do the same test, and tidy the output
chisq.test(performance$gender, performance$high_performer) %>% broom::tidy()
## # A tibble: 1 x 4
## statistic p.value parameter method
## <dbl> <dbl> <int> <chr>
## 1 22.2 0.00000242 1 Pearson's Chi-squared test with Yates' continu~
# Visualize the distribution of high_performer by gender
# (factor() is needed so ggplot treats the 0/1 indicator as discrete)
performance %>%
ggplot(aes(x=gender, fill=factor(high_performer))) +
geom_bar(position="fill")
# Visualize the distribution of all ratings by gender
performance %>%
ggplot(aes(x=gender, fill=factor(rating))) +
geom_bar(position="fill")
# Visualize the distribution of job_level by gender
performance %>%
ggplot(aes(x = gender, fill = job_level)) +
geom_bar(position = "fill")
# Test whether men and women have different job level distributions
chisq.test(performance$gender, performance$job_level)
##
## Pearson's Chi-squared test
##
## data: performance$gender and performance$job_level
## X-squared = 44.506, df = 2, p-value = 2.166e-10
# Visualize the distribution of high_performer by gender, faceted by job level
performance %>%
ggplot(aes(x = gender, fill = factor(high_performer))) +
geom_bar(position = "fill") +
facet_wrap(~ job_level)
# Run a simple logistic regression (high_performer on gender only)
logistic_simple <- glm(high_performer ~ gender, family = "binomial", data = performance)
# View the result with summary()
logistic_simple %>%
summary()
##
## Call:
## glm(formula = high_performer ~ gender, family = "binomial", data = performance)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.8871 -0.8871 -0.6957 1.4986 1.7535
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.29540 0.08813 -14.699 < 2e-16 ***
## genderMale 0.56596 0.11921 4.748 2.06e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1709.0 on 1469 degrees of freedom
## Residual deviance: 1686.1 on 1468 degrees of freedom
## AIC: 1690.1
##
## Number of Fisher Scoring iterations: 4
# View a tidy version of the result
logistic_simple %>%
broom::tidy()
## # A tibble: 2 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -1.30 0.0881 -14.7 6.58e-49
## 2 genderMale 0.566 0.119 4.75 2.06e- 6
# Run a multiple logistic regression, controlling for job_level
# (per the output below, the gender coefficient shrinks from ~0.57 to ~0.32
# once job level is included)
logistic_multiple <- glm(high_performer ~ gender + job_level, family = "binomial", data = performance)
# View the result with summary() or tidy()
logistic_multiple %>% broom::tidy()
## # A tibble: 4 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -1.69 0.103 -16.5 2.74e-61
## 2 genderMale 0.319 0.129 2.47 1.34e- 2
## 3 job_levelManager 2.74 0.251 10.9 1.01e-27
## 4 job_levelSalaried 1.10 0.141 7.82 5.17e-15
Chapter 5 - Improving employee safety with data
Employee safety - looking at accident rates and drivers:
Focusing on the location of interest:
Explaining the increase in accidents:
Wrap up:
Example code includes:
# Import the HR and accident data sets
hr_data <- readr::read_csv("./RInputFiles/hr_data_2.csv")
## Parsed with column specification:
## cols(
## year = col_double(),
## employee_id = col_double(),
## location = col_character(),
## overtime_hours = col_double()
## )
accident_data <- readr::read_csv("./RInputFiles/accident_data.csv")
## Parsed with column specification:
## cols(
## year = col_double(),
## employee_id = col_double(),
## accident_type = col_character()
## )
# Create hr_joined with left_join() and mutate()
# (accident_type is NA for employees with no accident record, so
# is.na(accident_type) gives a 0/1 had_accident indicator)
hr_joined <- left_join(hr_data, accident_data, by=c("year", "employee_id")) %>%
mutate(had_accident=ifelse(is.na(accident_type), 0, 1))
hr_joined
## # A tibble: 2,940 x 6
## year employee_id location overtime_hours accident_type had_accident
## <dbl> <dbl> <chr> <dbl> <chr> <dbl>
## 1 2016 1 Northwood 14 <NA> 0
## 2 2017 1 Northwood 8 Mild 1
## 3 2016 2 East Valley 8 <NA> 0
## 4 2017 2 East Valley 11 <NA> 0
## 5 2016 4 East Valley 4 <NA> 0
## 6 2017 4 East Valley 2 Mild 1
## 7 2016 5 West River 0 <NA> 0
## 8 2017 5 West River 17 <NA> 0
## 9 2016 7 West River 21 <NA> 0
## 10 2017 7 West River 9 <NA> 0
## # ... with 2,930 more rows
# Find accident rate for each year
hr_joined %>%
group_by(year) %>%
summarize(accident_rate = mean(had_accident))
## # A tibble: 2 x 2
## year accident_rate
## <dbl> <dbl>
## 1 2016 0.0850
## 2 2017 0.120
# Test difference in accident rate between years
chisq.test(hr_joined$year, hr_joined$had_accident)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: hr_joined$year and hr_joined$had_accident
## X-squared = 9.5986, df = 1, p-value = 0.001947
# Which location had the highest accident rate?
hr_joined %>%
group_by(location) %>%
summarize(accident_rate=mean(had_accident)) %>%
arrange(-accident_rate)
## # A tibble: 4 x 2
## location accident_rate
## <chr> <dbl>
## 1 East Valley 0.128
## 2 Southfield 0.103
## 3 West River 0.0961
## 4 Northwood 0.0831
# Compare annual accident rates by location
accident_rates <- hr_joined %>%
group_by(location, year) %>%
summarize(accident_rate = mean(had_accident))
accident_rates
## # A tibble: 8 x 3
## # Groups: location [4]
## location year accident_rate
## <chr> <dbl> <dbl>
## 1 East Valley 2016 0.113
## 2 East Valley 2017 0.143
## 3 Northwood 2016 0.0644
## 4 Northwood 2017 0.102
## 5 Southfield 2016 0.0764
## 6 Southfield 2017 0.130
## 7 West River 2016 0.0824
## 8 West River 2017 0.110
# Graph it
accident_rates %>%
ggplot(aes(factor(year), accident_rate)) +
geom_col() +
facet_wrap(~location)
# Filter out the other locations (keep Southfield only)
southfield <- hr_joined %>%
filter(location == "Southfield")
# Find the average overtime hours worked by year
southfield %>%
group_by(year) %>%
summarize(average_overtime_hours = mean(overtime_hours))
## # A tibble: 2 x 2
## year average_overtime_hours
## <dbl> <dbl>
## 1 2016 8.67
## 2 2017 9.97
# Test difference in Southfield's overtime hours between years
t.test(overtime_hours ~ year, data=southfield)
##
## Welch Two Sample t-test
##
## data: overtime_hours by year
## t = -1.6043, df = 595.46, p-value = 0.1092
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2.904043 0.292747
## sample estimates:
## mean in group 2016 mean in group 2017
## 8.667774 9.973422
# Import the survey data
survey_data <- readr::read_csv("./RInputFiles/survey_data_2.csv")
## Parsed with column specification:
## cols(
## year = col_double(),
## employee_id = col_double(),
## engagement = col_double()
## )
# Create the safety dataset
# (year is converted to a factor here so the later glm() treats it as a
# categorical predictor -- see the year2017 term in the regression output)
safety <- left_join(hr_joined, survey_data, by=c("employee_id", "year")) %>%
mutate(disengaged=ifelse(engagement <= 2, 1, 0), year=factor(year))
# Visualize the difference in % disengaged by year in Southfield
safety %>%
filter(location=="Southfield") %>%
ggplot(aes(x = year, fill = factor(disengaged))) +
geom_bar(position = "fill")
# Test whether one year had significantly more disengaged employees
southSafety <- safety %>%
filter(location=="Southfield")
chisq.test(southSafety$disengaged, southSafety$year)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: southSafety$disengaged and southSafety$year
## X-squared = 7.1906, df = 1, p-value = 0.007329
# Filter out Southfield (keep the other three locations as a comparison set)
other_locs <- safety %>%
filter(location != "Southfield")
# Test whether one year had significantly more overtime hours worked
t.test(overtime_hours ~ year, data = other_locs)
##
## Welch Two Sample t-test
##
## data: overtime_hours by year
## t = -0.48267, df = 2320.3, p-value = 0.6294
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.9961022 0.6026035
## sample estimates:
## mean in group 2016 mean in group 2017
## 9.278015 9.474765
# Test whether one year had significantly more disengaged employees
chisq.test(other_locs$year, other_locs$disengaged)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: other_locs$year and other_locs$disengaged
## X-squared = 0.0023091, df = 1, p-value = 0.9617
# Use multiple regression to test the impact of year and disengaged on accident rate in Southfield
regression <- glm(had_accident ~ year + disengaged, family = "binomial", data = southSafety)
# Examine the output
regression %>% broom::tidy()
## # A tibble: 3 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -2.92 0.250 -11.7 1.74e-31
## 2 year2017 0.440 0.285 1.55 1.22e- 1
## 3 disengaged 1.44 0.278 5.19 2.13e- 7
Chapter 1 - Cars Data
Making predictions using machine learning:
Getting started with caret:
Sampling data:
Example code includes:
# Import the 2018 fuel-economy data set used in the caret chapter
cars2018 <- readr::read_csv("./RInputFiles/cars2018.csv")
## Parsed with column specification:
## cols(
## Model = col_character(),
## `Model Index` = col_integer(),
## Displacement = col_double(),
## Cylinders = col_integer(),
## Gears = col_integer(),
## Transmission = col_character(),
## MPG = col_integer(),
## Aspiration = col_character(),
## `Lockup Torque Converter` = col_character(),
## Drive = col_character(),
## `Max Ethanol` = col_integer(),
## `Recommended Fuel` = col_character(),
## `Intake Valves Per Cyl` = col_integer(),
## `Exhaust Valves Per Cyl` = col_integer(),
## `Fuel injection` = col_character()
## )
# Inspect the structure (give.attr = FALSE suppresses attribute noise)
str(cars2018, give.attr = FALSE)
## Classes 'tbl_df', 'tbl' and 'data.frame': 1144 obs. of 15 variables:
## $ Model : chr "Acura NSX" "ALFA ROMEO 4C" "Audi R8 AWD" "Audi R8 RWD" ...
## $ Model Index : int 57 410 65 71 66 72 46 488 38 278 ...
## $ Displacement : num 3.5 1.8 5.2 5.2 5.2 5.2 2 3 8 6.2 ...
## $ Cylinders : int 6 4 10 10 10 10 4 6 16 8 ...
## $ Gears : int 9 6 7 7 7 7 6 7 7 8 ...
## $ Transmission : chr "Manual" "Manual" "Manual" "Manual" ...
## $ MPG : int 21 28 17 18 17 18 26 20 11 18 ...
## $ Aspiration : chr "Turbocharged/Supercharged" "Turbocharged/Supercharged" "Naturally Aspirated" "Naturally Aspirated" ...
## $ Lockup Torque Converter: chr "Y" "Y" "Y" "Y" ...
## $ Drive : chr "All Wheel Drive" "2-Wheel Drive, Rear" "All Wheel Drive" "2-Wheel Drive, Rear" ...
## $ Max Ethanol : int 10 10 15 15 15 15 15 10 15 10 ...
## $ Recommended Fuel : chr "Premium Unleaded Required" "Premium Unleaded Required" "Premium Unleaded Recommended" "Premium Unleaded Recommended" ...
## $ Intake Valves Per Cyl : int 2 2 2 2 2 2 2 2 2 1 ...
## $ Exhaust Valves Per Cyl : int 2 2 2 2 2 2 2 2 2 1 ...
## $ Fuel injection : chr "Direct ignition" "Direct ignition" "Direct ignition" "Direct ignition" ...
summary(cars2018)
## Model Model Index Displacement Cylinders
## Length:1144 Min. : 1.0 Min. :1.000 Min. : 3.000
## Class :character 1st Qu.: 36.0 1st Qu.:2.000 1st Qu.: 4.000
## Mode :character Median :108.0 Median :3.000 Median : 6.000
## Mean :201.3 Mean :3.087 Mean : 5.564
## 3rd Qu.:323.8 3rd Qu.:3.600 3rd Qu.: 6.000
## Max. :821.0 Max. :8.000 Max. :16.000
## Gears Transmission MPG Aspiration
## Min. : 1.000 Length:1144 Min. :11.0 Length:1144
## 1st Qu.: 6.000 Class :character 1st Qu.:19.0 Class :character
## Median : 7.000 Mode :character Median :23.0 Mode :character
## Mean : 6.935 Mean :23.2
## 3rd Qu.: 8.000 3rd Qu.:26.0
## Max. :10.000 Max. :58.0
## Lockup Torque Converter Drive Max Ethanol
## Length:1144 Length:1144 Min. :10.00
## Class :character Class :character 1st Qu.:10.00
## Mode :character Mode :character Median :10.00
## Mean :15.29
## 3rd Qu.:15.00
## Max. :85.00
## Recommended Fuel Intake Valves Per Cyl Exhaust Valves Per Cyl
## Length:1144 Min. :1.000 Min. :1.000
## Class :character 1st Qu.:2.000 1st Qu.:2.000
## Mode :character Median :2.000 Median :2.000
## Mean :1.926 Mean :1.922
## 3rd Qu.:2.000 3rd Qu.:2.000
## Max. :2.000 Max. :2.000
## Fuel injection
## Length:1144
## Class :character
## Mode :character
##
##
##
# Print the cars2018 object
cars2018
## # A tibble: 1,144 x 15
## Model `Model Index` Displacement Cylinders Gears Transmission MPG
## <chr> <int> <dbl> <int> <int> <chr> <int>
## 1 Acura NSX 57 3.50 6 9 Manual 21
## 2 ALFA ROM~ 410 1.80 4 6 Manual 28
## 3 Audi R8 ~ 65 5.20 10 7 Manual 17
## 4 Audi R8 ~ 71 5.20 10 7 Manual 18
## 5 Audi R8 ~ 66 5.20 10 7 Manual 17
## 6 Audi R8 ~ 72 5.20 10 7 Manual 18
## 7 Audi TT ~ 46 2.00 4 6 Manual 26
## 8 BMW M4 D~ 488 3.00 6 7 Manual 20
## 9 Bugatti ~ 38 8.00 16 7 Manual 11
## 10 Chevrole~ 278 6.20 8 8 Automatic 18
## # ... with 1,134 more rows, and 8 more variables: Aspiration <chr>,
## # `Lockup Torque Converter` <chr>, Drive <chr>, `Max Ethanol` <int>,
## # `Recommended Fuel` <chr>, `Intake Valves Per Cyl` <int>, `Exhaust
## # Valves Per Cyl` <int>, `Fuel injection` <chr>
# Plot the histogram
# Distribution of fuel efficiency across the 1,144 cars
ggplot(cars2018, aes(x = MPG)) +
geom_histogram(bins = 25) +
labs(y = "Number of cars",
x = "Fuel efficiency (mpg)")
# Deselect the 2 columns to create cars_vars
# Model and `Model Index` are identifiers, not predictors
cars_vars <- cars2018 %>%
select(-Model, -`Model Index`)
# Fit a linear model
# MPG ~ . regresses fuel efficiency on all remaining columns
fit_all <- lm(MPG ~ ., data = cars_vars)
# Print the summary of the model
summary(fit_all)
##
## Call:
## lm(formula = MPG ~ ., data = cars_vars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.5261 -1.6473 -0.1096 1.3572 26.5045
##
## Coefficients:
## Estimate Std. Error
## (Intercept) 44.539519 1.176283
## Displacement -3.786147 0.264845
## Cylinders 0.520284 0.161802
## Gears 0.157674 0.069984
## TransmissionCVT 4.877637 0.404051
## TransmissionManual -1.074608 0.366075
## AspirationTurbocharged/Supercharged -2.190248 0.267559
## `Lockup Torque Converter`Y -2.624494 0.381252
## Drive2-Wheel Drive, Rear -2.676716 0.291044
## Drive4-Wheel Drive -3.397532 0.335147
## DriveAll Wheel Drive -2.941084 0.257174
## `Max Ethanol` -0.007377 0.005898
## `Recommended Fuel`Premium Unleaded Required -0.403935 0.262413
## `Recommended Fuel`Regular Unleaded Recommended -0.996343 0.272495
## `Intake Valves Per Cyl` -1.446107 1.620575
## `Exhaust Valves Per Cyl` -2.469747 1.547748
## `Fuel injection`Multipoint/sequential ignition -0.658428 0.243819
## t value Pr(>|t|)
## (Intercept) 37.865 < 2e-16 ***
## Displacement -14.296 < 2e-16 ***
## Cylinders 3.216 0.001339 **
## Gears 2.253 0.024450 *
## TransmissionCVT 12.072 < 2e-16 ***
## TransmissionManual -2.935 0.003398 **
## AspirationTurbocharged/Supercharged -8.186 7.24e-16 ***
## `Lockup Torque Converter`Y -6.884 9.65e-12 ***
## Drive2-Wheel Drive, Rear -9.197 < 2e-16 ***
## Drive4-Wheel Drive -10.137 < 2e-16 ***
## DriveAll Wheel Drive -11.436 < 2e-16 ***
## `Max Ethanol` -1.251 0.211265
## `Recommended Fuel`Premium Unleaded Required -1.539 0.124010
## `Recommended Fuel`Regular Unleaded Recommended -3.656 0.000268 ***
## `Intake Valves Per Cyl` -0.892 0.372400
## `Exhaust Valves Per Cyl` -1.596 0.110835
## `Fuel injection`Multipoint/sequential ignition -2.700 0.007028 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.916 on 1127 degrees of freedom
## Multiple R-squared: 0.7314, Adjusted R-squared: 0.7276
## F-statistic: 191.8 on 16 and 1127 DF, p-value: < 2.2e-16
# Load caret
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
# Split the data into training and test sets
set.seed(1234)
# createDataPartition stratifies the 80/20 split on Transmission
in_train <- createDataPartition(cars_vars$Transmission, p = 0.8, list = FALSE)
training <- cars_vars[in_train, ]
testing <- cars_vars[-in_train, ]
# Train a linear regression model
# Response is log(MPG); trainControl(method = "none") fits once, no resampling
fit_lm <- train(log(MPG) ~ ., method = "lm", data = training,
trControl = trainControl(method = "none"))
# Print the model object
fit_lm
## Linear Regression
##
## 916 samples
## 12 predictor
##
## No pre-processing
## Resampling: None
# Train a random forest model
# Same formula as the linear model (response on the log(MPG) scale), no resampling
fit_rf <- train(log(MPG) ~ ., method = "rf", data = training,
trControl = trainControl(method = "none"))
# Print the model object
fit_rf
## Random Forest
##
## 916 samples
## 12 predictor
##
## No pre-processing
## Resampling: None
# Create the new columns
# Predictions are on the log(MPG) scale (both models were trained on log(MPG))
results <- training %>%
mutate(`Linear regression` = predict(fit_lm, training),
`Random forest` = predict(fit_rf, training))
# Evaluate the performance
# NOTE(review): truth = MPG is on the raw scale while the estimates are
# log-scale predictions; the large rmse (~20.9) reflects that mismatch.
# Comparing against log(MPG) (or exponentiating the predictions) would give
# a meaningful rmse -- confirm intent.
yardstick::metrics(results, truth = MPG, estimate = `Linear regression`)
## # A tibble: 1 x 2
## rmse rsq
## <dbl> <dbl>
## 1 20.9 0.702
yardstick::metrics(results, truth = MPG, estimate = `Random forest`)
## # A tibble: 1 x 2
## rmse rsq
## <dbl> <dbl>
## 1 20.9 0.845
# Create the new columns
# Same predictions as above, now on the held-out testing set
results <- testing %>%
mutate(`Linear regression` = predict(fit_lm, testing),
`Random forest` = predict(fit_rf, testing))
# Evaluate the performance
# NOTE(review): as with the training metrics, truth = MPG (raw scale) is
# compared against log-scale predictions, which inflates rmse -- confirm.
yardstick::metrics(results, truth = MPG, estimate = `Linear regression`)
## # A tibble: 1 x 2
## rmse rsq
## <dbl> <dbl>
## 1 20.5 0.799
yardstick::metrics(results, truth = MPG, estimate = `Random forest`)
## # A tibble: 1 x 2
## rmse rsq
## <dbl> <dbl>
## 1 20.5 0.880
# Fit the models with bootstrap resampling
# method = "boot" draws 25 bootstrap resamples by default
cars_lm_bt <- train(log(MPG) ~ ., method = "lm", data = training,
trControl = trainControl(method = "boot"))
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
# NOTE(review): the rank-deficiency warnings presumably mean some bootstrap
# samples were missing a factor level, leaving collinear dummy columns -- confirm
cars_rf_bt <- train(log(MPG) ~ ., method = "rf", data = training,
trControl = trainControl(method = "boot"))
# Quick look at the models
cars_lm_bt
## Linear Regression
##
## 916 samples
## 12 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 916, 916, 916, 916, 916, 916, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 0.1036278 0.7890514 0.07656104
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
cars_rf_bt
## Random Forest
##
## 916 samples
## 12 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 916, 916, 916, 916, 916, 916, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 0.10015480 0.8205322 0.07299305
## 9 0.08758544 0.8466598 0.06129895
## 16 0.09100659 0.8360034 0.06313542
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 9.
# Combine held-out predictions from both bootstrap-tuned models
results <- testing %>%
mutate(`Linear regression` = predict(cars_lm_bt, testing),
`Random forest` = predict(cars_rf_bt, testing))
# NOTE(review): truth = MPG vs log-scale estimates again (see earlier note)
yardstick::metrics(results, truth = MPG, estimate = `Linear regression`)
## # A tibble: 1 x 2
## rmse rsq
## <dbl> <dbl>
## 1 20.5 0.799
yardstick::metrics(results, truth = MPG, estimate = `Random forest`)
## # A tibble: 1 x 2
## rmse rsq
## <dbl> <dbl>
## 1 20.5 0.903
# Plot predictions vs observed log(MPG), one panel per model; the dashed
# gray line is the identity (perfect prediction) line
# NOTE: gather() is superseded; pivot_longer() is the modern equivalent
results %>%
gather(Method, Result, `Linear regression`:`Random forest`) %>%
ggplot(aes(log(MPG), Result, color = Method)) +
geom_point(size = 1.5, alpha = 0.5) +
facet_wrap(~Method) +
geom_abline(lty = 2, color = "gray50") +
geom_smooth(method = "lm")
Chapter 2 - Stack Overflow Developer Data
Essential copying and pasting from Stack Overflow (largest and most trusted developer community):
Dealing with imbalanced data:
Predicting remote status:
Logistic regression. Example code includes:
stackoverflow <- readr::read_csv("./RInputFiles/stackoverflow.csv")
## Parsed with column specification:
## cols(
## .default = col_logical(),
## Respondent = col_integer(),
## Country = col_character(),
## Salary = col_double(),
## YearsCodedJob = col_integer(),
## CompanySizeNumber = col_double(),
## Remote = col_character(),
## CareerSatisfaction = col_integer()
## )
## See spec(...) for full column specifications.
stackoverflow$Remote <- factor(stackoverflow$Remote, levels=c("Not remote", "Remote"))
str(stackoverflow, give.attr = FALSE)
## Classes 'tbl_df', 'tbl' and 'data.frame': 6991 obs. of 22 variables:
## $ Respondent : int 3 15 18 19 26 55 62 71 73 77 ...
## $ Country : chr "United Kingdom" "United Kingdom" "United States" "United States" ...
## $ Salary : num 113750 100000 130000 82500 175000 ...
## $ YearsCodedJob : int 20 20 20 3 16 4 1 1 20 20 ...
## $ OpenSource : logi TRUE FALSE TRUE FALSE FALSE FALSE ...
## $ Hobby : logi TRUE TRUE TRUE TRUE TRUE FALSE ...
## $ CompanySizeNumber : num 10000 5000 1000 10000 10000 1000 5000 20 100 1000 ...
## $ Remote : Factor w/ 2 levels "Not remote","Remote": 1 2 2 1 1 1 1 1 2 2 ...
## $ CareerSatisfaction : int 8 8 9 5 7 9 5 8 8 10 ...
## $ Data scientist : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Database administrator : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Desktop applications developer : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Developer with stats/math background: logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ DevOps : logi FALSE FALSE TRUE FALSE FALSE FALSE ...
## $ Embedded developer : logi FALSE TRUE TRUE FALSE FALSE FALSE ...
## $ Graphic designer : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Graphics programming : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Machine learning specialist : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Mobile developer : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Quality assurance engineer : logi FALSE FALSE TRUE FALSE FALSE FALSE ...
## $ Systems administrator : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Web developer : logi FALSE FALSE TRUE TRUE TRUE TRUE ...
# Print stackoverflow
stackoverflow
## # A tibble: 6,991 x 22
## Respondent Country Salary YearsCodedJob OpenSource Hobby
## <int> <chr> <dbl> <int> <lgl> <lgl>
## 1 3 United Kingdom 113750 20 T T
## 2 15 United Kingdom 100000 20 F T
## 3 18 United States 130000 20 T T
## 4 19 United States 82500 3 F T
## 5 26 United States 175000 16 F T
## 6 55 Germany 64516 4 F F
## 7 62 India 6636 1 F T
## 8 71 United States 65000 1 F T
## 9 73 United States 120000 20 T T
## 10 77 United States 96283 20 T T
## # ... with 6,981 more rows, and 16 more variables:
## # CompanySizeNumber <dbl>, Remote <fct>, CareerSatisfaction <int>, `Data
## # scientist` <lgl>, `Database administrator` <lgl>, `Desktop
## # applications developer` <lgl>, `Developer with stats/math
## # background` <lgl>, DevOps <lgl>, `Embedded developer` <lgl>, `Graphic
## # designer` <lgl>, `Graphics programming` <lgl>, `Machine learning
## # specialist` <lgl>, `Mobile developer` <lgl>, `Quality assurance
## # engineer` <lgl>, `Systems administrator` <lgl>, `Web developer` <lgl>
# First count for Remote
stackoverflow %>%
count(Remote, sort = TRUE)
## # A tibble: 2 x 2
## Remote n
## <fct> <int>
## 1 Not remote 6273
## 2 Remote 718
# then count for Country
stackoverflow %>%
count(Country, sort = TRUE)
## # A tibble: 5 x 2
## Country n
## <chr> <int>
## 1 United States 3486
## 2 United Kingdom 1270
## 3 Germany 950
## 4 India 666
## 5 Canada 619
# Boxplot of coding experience by remote status
ggplot(stackoverflow, aes(x=Remote, y=YearsCodedJob)) +
geom_boxplot() +
labs(x = NULL,
y = "Years of professional coding experience")
# Build a simple logistic regression model
# Respondent is an ID, so drop it; data = . pipes the selected frame into glm
simple_glm <- stackoverflow %>%
select(-Respondent) %>%
glm(Remote ~ .,
family = "binomial",
data = .)
# Print the summary of the model
summary(simple_glm)
##
## Call:
## glm(formula = Remote ~ ., family = "binomial", data = .)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.1942 -0.4971 -0.3824 -0.2867 2.9118
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) -4.156e+00 2.929e-01 -14.187
## CountryGermany -2.034e-01 2.196e-01 -0.927
## CountryIndia 9.574e-01 2.220e-01 4.312
## CountryUnited Kingdom 5.599e-02 1.974e-01 0.284
## CountryUnited States 5.990e-01 1.799e-01 3.330
## Salary 4.076e-06 1.589e-06 2.565
## YearsCodedJob 7.133e-02 7.556e-03 9.440
## OpenSourceTRUE 4.207e-01 8.555e-02 4.917
## HobbyTRUE 8.330e-03 9.827e-02 0.085
## CompanySizeNumber -6.104e-05 1.223e-05 -4.990
## CareerSatisfaction 6.748e-02 2.664e-02 2.533
## `Data scientist`TRUE -1.186e-01 1.838e-01 -0.645
## `Database administrator`TRUE 2.763e-01 1.267e-01 2.181
## `Desktop applications developer`TRUE -2.903e-01 9.842e-02 -2.950
## `Developer with stats/math background`TRUE 2.840e-02 1.359e-01 0.209
## DevOpsTRUE -1.532e-01 1.292e-01 -1.185
## `Embedded developer`TRUE -2.777e-01 1.653e-01 -1.680
## `Graphic designer`TRUE -1.904e-01 2.725e-01 -0.699
## `Graphics programming`TRUE 1.078e-01 2.312e-01 0.466
## `Machine learning specialist`TRUE -2.289e-01 2.769e-01 -0.827
## `Mobile developer`TRUE 2.170e-01 1.019e-01 2.130
## `Quality assurance engineer`TRUE -2.826e-01 2.437e-01 -1.160
## `Systems administrator`TRUE 1.462e-01 1.421e-01 1.029
## `Web developer`TRUE 1.158e-01 9.993e-02 1.159
## Pr(>|z|)
## (Intercept) < 2e-16 ***
## CountryGermany 0.354161
## CountryIndia 1.62e-05 ***
## CountryUnited Kingdom 0.776710
## CountryUnited States 0.000868 ***
## Salary 0.010314 *
## YearsCodedJob < 2e-16 ***
## OpenSourceTRUE 8.78e-07 ***
## HobbyTRUE 0.932444
## CompanySizeNumber 6.04e-07 ***
## CareerSatisfaction 0.011323 *
## `Data scientist`TRUE 0.518709
## `Database administrator`TRUE 0.029184 *
## `Desktop applications developer`TRUE 0.003178 **
## `Developer with stats/math background`TRUE 0.834400
## DevOpsTRUE 0.235833
## `Embedded developer`TRUE 0.093039 .
## `Graphic designer`TRUE 0.484596
## `Graphics programming`TRUE 0.641060
## `Machine learning specialist`TRUE 0.408484
## `Mobile developer`TRUE 0.033194 *
## `Quality assurance engineer`TRUE 0.246098
## `Systems administrator`TRUE 0.303507
## `Web developer`TRUE 0.246655
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4627.8 on 6990 degrees of freedom
## Residual deviance: 4268.8 on 6967 degrees of freedom
## AIC: 4316.8
##
## Number of Fisher Scoring iterations: 5
# Drop the respondent ID before modeling
stack_select <- stackoverflow %>%
select(-Respondent)
# Split the data into training and testing sets
set.seed(1234)
# Stratify the 80/20 split on the (imbalanced) Remote outcome
in_train <- caret::createDataPartition(stack_select$Remote, p=0.8, list = FALSE)
training <- stack_select[in_train,]
testing <- stack_select[-in_train,]
# Up-sample the minority class so both levels of Remote are equally represented
up_train <- caret::upSample(x = select(training, -Remote), y = training$Remote, yname = "Remote") %>%
as_tibble()
up_train %>%
count(Remote)
## # A tibble: 2 x 2
## Remote n
## <fct> <int>
## 1 Not remote 5019
## 2 Remote 5019
# Sub-sample to 5% of original
# NOTE: seq_len(nrow(training)) is the safer idiom than 1:nrow(training)
inUse <- sample(1:nrow(training), round(0.05*nrow(training)), replace=FALSE)
useTrain <- training[sort(inUse), ]
# Build a logistic regression model
# Fit on the full training set; up-sampling is applied inside each bootstrap resample
stack_glm <- caret::train(Remote ~ ., method="glm", family="binomial", data = training,
trControl = trainControl(method = "boot", sampling = "up")
)
# Print the model object
stack_glm
## Generalized Linear Model
##
## 5594 samples
## 20 predictor
## 2 classes: 'Not remote', 'Remote'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 5594, 5594, 5594, 5594, 5594, 5594, ...
## Addtional sampling using up-sampling
##
## Resampling results:
##
## Accuracy Kappa
## 0.6568743 0.1279825
# Build a random forest model
# Trained on the 5% sub-sample (useTrain), unlike stack_glm above
stack_rf <- caret::train(Remote ~ ., method="rf", data = useTrain,
trControl = trainControl(method = "boot", sampling="up")
)
# Print the model object
stack_rf
## Random Forest
##
## 280 samples
## 20 predictor
## 2 classes: 'Not remote', 'Remote'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 280, 280, 280, 280, 280, 280, ...
## Addtional sampling using up-sampling
##
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.8626254 0.110738058
## 12 0.9038825 -0.002127159
## 23 0.8887612 0.035777206
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 12.
# Confusion matrix for logistic regression model
# Evaluated on the held-out testing set; "Not remote" is the positive class
caret::confusionMatrix(predict(stack_glm, testing), testing$Remote)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Not remote Remote
## Not remote 837 53
## Remote 417 90
##
## Accuracy : 0.6636
## 95% CI : (0.6381, 0.6883)
## No Information Rate : 0.8976
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1395
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.6675
## Specificity : 0.6294
## Pos Pred Value : 0.9404
## Neg Pred Value : 0.1775
## Prevalence : 0.8976
## Detection Rate : 0.5991
## Detection Prevalence : 0.6371
## Balanced Accuracy : 0.6484
##
## 'Positive' Class : Not remote
##
# Confusion matrix for random forest model
# Higher accuracy, but specificity is only ~0.13 -- it rarely detects Remote
caret::confusionMatrix(predict(stack_rf, testing), testing$Remote)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Not remote Remote
## Not remote 1207 125
## Remote 47 18
##
## Accuracy : 0.8769
## 95% CI : (0.8585, 0.8937)
## No Information Rate : 0.8976
## P-Value [Acc > NIR] : 0.9945
##
## Kappa : 0.1166
## Mcnemar's Test P-Value : 4.327e-09
##
## Sensitivity : 0.9625
## Specificity : 0.1259
## Pos Pred Value : 0.9062
## Neg Pred Value : 0.2769
## Prevalence : 0.8976
## Detection Rate : 0.8640
## Detection Prevalence : 0.9535
## Balanced Accuracy : 0.5442
##
## 'Positive' Class : Not remote
##
# Predict values
# Add class predictions from both models as new columns on the testing set
testing_results <- testing %>%
mutate(`Logistic regression` = predict(stack_glm, testing), `Random forest` = predict(stack_rf, testing))
## Calculate accuracy
yardstick::accuracy(testing_results, truth = Remote, estimate = `Logistic regression`)
## [1] 0.6635648
yardstick::accuracy(testing_results, truth = Remote, estimate = `Random forest`)
## [1] 0.876879
## Calculate positive predict value
# (same accuracy and PPV figures as the confusionMatrix output above)
yardstick::ppv(testing_results, truth = Remote, estimate = `Logistic regression`)
## [1] 0.9404494
yardstick::ppv(testing_results, truth = Remote, estimate = `Random forest`)
## [1] 0.9061562
Chapter 3 - Voting
Predicting voter turnout from survey data:
Vote 2016:
Cross-validation is the process of sub-dividing the data into folds, with each fold used once as the validation set:
Comparing model performance:
Example code includes:
voters <- readr::read_csv("./RInputFiles/voters.csv")
## Parsed with column specification:
## cols(
## .default = col_integer(),
## turnout16_2016 = col_character()
## )
## See spec(...) for full column specifications.
voters$turnout16_2016 <- factor(voters$turnout16_2016, levels=c("Did not vote", "Voted"))
str(voters, give.attr = FALSE)
## Classes 'tbl_df', 'tbl' and 'data.frame': 6692 obs. of 43 variables:
## $ case_identifier : int 779 2108 2597 4148 4460 5225 5903 6059 8048 13112 ...
## $ turnout16_2016 : Factor w/ 2 levels "Did not vote",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ RIGGED_SYSTEM_1_2016: int 3 2 2 1 3 3 3 2 4 2 ...
## $ RIGGED_SYSTEM_2_2016: int 4 1 4 4 1 3 4 3 4 3 ...
## $ RIGGED_SYSTEM_3_2016: int 1 3 1 1 3 2 1 3 1 1 ...
## $ RIGGED_SYSTEM_4_2016: int 4 1 4 4 1 2 1 2 3 2 ...
## $ RIGGED_SYSTEM_5_2016: int 3 3 1 2 3 2 2 1 3 2 ...
## $ RIGGED_SYSTEM_6_2016: int 2 2 1 1 2 3 1 2 1 2 ...
## $ track_2016 : int 2 2 1 1 2 2 1 2 2 2 ...
## $ persfinretro_2016 : int 2 3 3 1 2 2 2 3 2 1 ...
## $ econtrend_2016 : int 1 3 3 1 2 2 1 3 1 1 ...
## $ Americatrend_2016 : int 1 1 1 3 3 1 2 3 2 1 ...
## $ futuretrend_2016 : int 4 1 1 3 4 3 1 3 1 1 ...
## $ wealth_2016 : int 2 1 2 2 1 2 2 1 2 2 ...
## $ values_culture_2016 : int 2 3 3 3 3 2 3 3 1 3 ...
## $ US_respect_2016 : int 2 3 1 1 2 2 2 3 3 3 ...
## $ trustgovt_2016 : int 3 3 3 3 3 2 3 3 3 3 ...
## $ trust_people_2016 : int 8 2 1 1 1 2 2 1 2 1 ...
## $ helpful_people_2016 : int 1 1 2 1 1 1 2 2 1 2 ...
## $ fair_people_2016 : int 8 2 1 1 1 2 2 1 2 1 ...
## $ imiss_a_2016 : int 2 1 1 1 1 2 1 1 3 1 ...
## $ imiss_b_2016 : int 2 1 1 2 1 1 1 2 1 1 ...
## $ imiss_c_2016 : int 1 2 2 3 1 2 2 1 4 2 ...
## $ imiss_d_2016 : int 1 2 1 1 1 1 1 2 1 1 ...
## $ imiss_e_2016 : int 1 1 3 1 1 3 1 2 1 1 ...
## $ imiss_f_2016 : int 2 1 1 2 1 2 1 3 2 1 ...
## $ imiss_g_2016 : int 1 4 3 3 3 1 3 4 2 2 ...
## $ imiss_h_2016 : int 1 2 2 2 1 1 1 2 1 1 ...
## $ imiss_i_2016 : int 2 2 4 4 2 1 1 3 2 1 ...
## $ imiss_j_2016 : int 1 1 1 1 1 1 1 1 1 1 ...
## $ imiss_k_2016 : int 1 2 1 1 2 1 1 4 2 1 ...
## $ imiss_l_2016 : int 1 4 1 2 4 1 1 3 1 1 ...
## $ imiss_m_2016 : int 1 2 1 2 1 1 1 1 1 1 ...
## $ imiss_n_2016 : int 1 2 1 1 1 1 1 2 2 1 ...
## $ imiss_o_2016 : int 2 1 1 1 1 2 1 2 2 1 ...
## $ imiss_p_2016 : int 2 1 2 3 1 3 1 1 4 1 ...
## $ imiss_q_2016 : int 1 1 1 2 2 1 1 4 2 1 ...
## $ imiss_r_2016 : int 2 1 1 2 1 2 1 2 4 2 ...
## $ imiss_s_2016 : int 1 2 1 2 2 1 1 1 1 1 ...
## $ imiss_t_2016 : int 1 1 3 3 1 1 3 4 1 1 ...
## $ imiss_u_2016 : int 2 2 2 2 1 3 3 1 4 2 ...
## $ imiss_x_2016 : int 1 3 1 2 1 1 1 4 1 1 ...
## $ imiss_y_2016 : int 1 4 2 3 1 1 1 3 2 1 ...
# Print voters
voters
## # A tibble: 6,692 x 43
## case_identifier turnout16_2016 RIGGED_SYSTEM_1_2016 RIGGED_SYSTEM_2_20~
## <int> <fct> <int> <int>
## 1 779 Voted 3 4
## 2 2108 Voted 2 1
## 3 2597 Voted 2 4
## 4 4148 Voted 1 4
## 5 4460 Voted 3 1
## 6 5225 Voted 3 3
## 7 5903 Voted 3 4
## 8 6059 Voted 2 3
## 9 8048 Voted 4 4
## 10 13112 Voted 2 3
## # ... with 6,682 more rows, and 39 more variables:
## # RIGGED_SYSTEM_3_2016 <int>, RIGGED_SYSTEM_4_2016 <int>,
## # RIGGED_SYSTEM_5_2016 <int>, RIGGED_SYSTEM_6_2016 <int>,
## # track_2016 <int>, persfinretro_2016 <int>, econtrend_2016 <int>,
## # Americatrend_2016 <int>, futuretrend_2016 <int>, wealth_2016 <int>,
## # values_culture_2016 <int>, US_respect_2016 <int>,
## # trustgovt_2016 <int>, trust_people_2016 <int>,
## # helpful_people_2016 <int>, fair_people_2016 <int>, imiss_a_2016 <int>,
## # imiss_b_2016 <int>, imiss_c_2016 <int>, imiss_d_2016 <int>,
## # imiss_e_2016 <int>, imiss_f_2016 <int>, imiss_g_2016 <int>,
## # imiss_h_2016 <int>, imiss_i_2016 <int>, imiss_j_2016 <int>,
## # imiss_k_2016 <int>, imiss_l_2016 <int>, imiss_m_2016 <int>,
## # imiss_n_2016 <int>, imiss_o_2016 <int>, imiss_p_2016 <int>,
## # imiss_q_2016 <int>, imiss_r_2016 <int>, imiss_s_2016 <int>,
## # imiss_t_2016 <int>, imiss_u_2016 <int>, imiss_x_2016 <int>,
## # imiss_y_2016 <int>
# How many people voted?
# Turnout is heavily imbalanced: 264 non-voters vs 6,428 voters
voters %>%
count(turnout16_2016)
## # A tibble: 2 x 2
## turnout16_2016 n
## <fct> <int>
## 1 Did not vote 264
## 2 Voted 6428
# How do the responses on the survey vary with voting behavior?
# Each summary column is the proportion of the group giving a particular answer
voters %>%
group_by(turnout16_2016) %>%
summarize(`Elections don't matter` = mean(RIGGED_SYSTEM_1_2016 <= 2),
`Economy is getting better` = mean(econtrend_2016 == 1),
`Crime is very important` = mean(imiss_a_2016 == 2))
## # A tibble: 2 x 4
## turnout16_2016 `Elections don't ~ `Economy is gettin~ `Crime is very im~
## <fct> <dbl> <dbl> <dbl>
## 1 Did not vote 0.553 0.163 0.292
## 2 Voted 0.341 0.289 0.342
## Visualize difference by voter turnout
# Overlaid density-scaled histograms of economic outlook by turnout group
voters %>%
ggplot(aes(econtrend_2016, ..density.., fill = turnout16_2016)) +
geom_histogram(alpha = 0.5, position = "identity", binwidth = 1) +
labs(title = "Overall, is the economy getting better or worse?")
# Remove the case_identifier column
# (case_identifier is an ID, not a predictor)
voters_select <- voters %>%
select(-case_identifier)
# Build a simple logistic regression model
simple_glm <- glm(turnout16_2016 ~ ., family = "binomial",
data = voters_select)
# Print the summary
summary(simple_glm)
##
## Call:
## glm(formula = turnout16_2016 ~ ., family = "binomial", data = voters_select)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.2373 0.1651 0.2214 0.3004 1.7708
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.457036 0.732721 3.353 0.000799 ***
## RIGGED_SYSTEM_1_2016 0.236284 0.085081 2.777 0.005484 **
## RIGGED_SYSTEM_2_2016 0.064749 0.089208 0.726 0.467946
## RIGGED_SYSTEM_3_2016 0.049357 0.107352 0.460 0.645680
## RIGGED_SYSTEM_4_2016 -0.074694 0.087583 -0.853 0.393749
## RIGGED_SYSTEM_5_2016 0.190252 0.096454 1.972 0.048556 *
## RIGGED_SYSTEM_6_2016 -0.005881 0.101381 -0.058 0.953740
## track_2016 0.241075 0.121467 1.985 0.047178 *
## persfinretro_2016 -0.040229 0.106714 -0.377 0.706191
## econtrend_2016 -0.295370 0.087224 -3.386 0.000708 ***
## Americatrend_2016 -0.105213 0.080845 -1.301 0.193116
## futuretrend_2016 0.210568 0.071201 2.957 0.003103 **
## wealth_2016 -0.069405 0.026344 -2.635 0.008424 **
## values_culture_2016 -0.041402 0.038670 -1.071 0.284332
## US_respect_2016 -0.068200 0.043785 -1.558 0.119322
## trustgovt_2016 0.315354 0.166655 1.892 0.058456 .
## trust_people_2016 0.040423 0.041518 0.974 0.330236
## helpful_people_2016 -0.037513 0.035353 -1.061 0.288643
## fair_people_2016 -0.017081 0.030170 -0.566 0.571294
## imiss_a_2016 0.397121 0.138987 2.857 0.004273 **
## imiss_b_2016 -0.250803 0.155454 -1.613 0.106667
## imiss_c_2016 0.017536 0.090647 0.193 0.846606
## imiss_d_2016 0.043510 0.122118 0.356 0.721619
## imiss_e_2016 -0.095552 0.078603 -1.216 0.224126
## imiss_f_2016 -0.323280 0.105432 -3.066 0.002168 **
## imiss_g_2016 -0.332034 0.078673 -4.220 2.44e-05 ***
## imiss_h_2016 -0.157298 0.107111 -1.469 0.141954
## imiss_i_2016 0.088695 0.091467 0.970 0.332196
## imiss_j_2016 0.060271 0.138429 0.435 0.663280
## imiss_k_2016 -0.181030 0.082726 -2.188 0.028646 *
## imiss_l_2016 0.274689 0.106781 2.572 0.010098 *
## imiss_m_2016 -0.124269 0.147888 -0.840 0.400746
## imiss_n_2016 -0.441612 0.090040 -4.905 9.36e-07 ***
## imiss_o_2016 0.198635 0.143160 1.388 0.165286
## imiss_p_2016 0.102987 0.105669 0.975 0.329751
## imiss_q_2016 0.244567 0.119093 2.054 0.040017 *
## imiss_r_2016 0.198839 0.121969 1.630 0.103050
## imiss_s_2016 -0.067310 0.134465 -0.501 0.616666
## imiss_t_2016 -0.116757 0.068143 -1.713 0.086639 .
## imiss_u_2016 0.022902 0.097312 0.235 0.813939
## imiss_x_2016 -0.017789 0.097349 -0.183 0.855003
## imiss_y_2016 0.150205 0.094536 1.589 0.112092
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2224.3 on 6691 degrees of freedom
## Residual deviance: 2004.4 on 6650 degrees of freedom
## AIC: 2088.4
##
## Number of Fisher Scoring iterations: 6
# Split data into training and testing sets
set.seed(1234)
# Stratify the 80/20 split on the imbalanced turnout outcome
in_train <- caret::createDataPartition(voters_select$turnout16_2016, p = 0.8, list = FALSE)
training <- voters_select[in_train, ]
testing <- voters_select[-in_train, ]
# Perform logistic regression with upsampling and no resampling
vote_glm_1 <- caret::train(turnout16_2016 ~ ., method = "glm", family = "binomial", data = training,
trControl = trainControl(method = "none", sampling = "up")
)
# Print vote_glm
vote_glm_1
## Generalized Linear Model
##
## 5355 samples
## 41 predictor
## 2 classes: 'Did not vote', 'Voted'
##
## No pre-processing
## Resampling: None
## Addtional sampling using up-sampling
# Sub-sample 10% of the training data to keep the CV runtime manageable
useSmall <- sort(sample(1:nrow(training), round(0.1*nrow(training)), replace=FALSE))
trainSmall <- training[useSmall, ]
# Logistic regression
# 10-fold cross-validation repeated 2 times, with up-sampling in each fold
vote_glm <- caret::train(turnout16_2016 ~ ., method = "glm", family = "binomial", data = trainSmall,
trControl = trainControl(method = "repeatedcv", repeats = 2, sampling = "up")
)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Print vote_glm
vote_glm
## Generalized Linear Model
##
## 536 samples
## 41 predictor
## 2 classes: 'Did not vote', 'Voted'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 2 times)
## Summary of sample sizes: 482, 482, 482, 483, 482, 483, ...
## Addtional sampling using up-sampling
##
## Resampling results:
##
## Accuracy Kappa
## 0.8713138 0.04298445
# Random forest
# Same repeated-CV + up-sampling control as the logistic model
vote_rf <- caret::train(turnout16_2016 ~ ., method = "rf", data = trainSmall,
trControl = trainControl(method="repeatedcv", repeats=2, sampling = "up")
)
# Print vote_rf
vote_rf
## Random Forest
##
## 536 samples
## 41 predictor
## 2 classes: 'Did not vote', 'Voted'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 2 times)
## Summary of sample sizes: 483, 482, 483, 482, 483, 483, ...
## Addtional sampling using up-sampling
##
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.9674179 -0.001265823
## 21 0.9627184 -0.006073829
## 41 0.9542628 0.019107234
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
# Confusion matrix for logistic regression model on training data
# (training-set metrics are optimistic; compare against the testing runs below)
caret::confusionMatrix(predict(vote_glm, trainSmall), trainSmall$turnout16_2016)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Did not vote Voted
## Did not vote 17 48
## Voted 0 471
##
## Accuracy : 0.9104
## 95% CI : (0.883, 0.9332)
## No Information Rate : 0.9683
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.3836
## Mcnemar's Test P-Value : 1.17e-11
##
## Sensitivity : 1.00000
## Specificity : 0.90751
## Pos Pred Value : 0.26154
## Neg Pred Value : 1.00000
## Prevalence : 0.03172
## Detection Rate : 0.03172
## Detection Prevalence : 0.12127
## Balanced Accuracy : 0.95376
##
## 'Positive' Class : Did not vote
##
# Confusion matrix for random forest model on training data
# The perfect accuracy here (see output below) is on the data the model was
# trained on, so it says nothing about generalization - compare the test-set
# confusion matrix further down.
caret::confusionMatrix(predict(vote_rf, trainSmall), trainSmall$turnout16_2016)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Did not vote Voted
## Did not vote 17 0
## Voted 0 519
##
## Accuracy : 1
## 95% CI : (0.9931, 1)
## No Information Rate : 0.9683
## P-Value [Acc > NIR] : 3.143e-08
##
## Kappa : 1
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.00000
## Specificity : 1.00000
## Pos Pred Value : 1.00000
## Neg Pred Value : 1.00000
## Prevalence : 0.03172
## Detection Rate : 0.03172
## Detection Prevalence : 0.03172
## Balanced Accuracy : 1.00000
##
## 'Positive' Class : Did not vote
##
# Confusion matrix for logistic regression model on testing data
# Evaluation on the held-out testing set; performance drops sharply versus
# the training-set matrix above.
caret::confusionMatrix(predict(vote_glm, testing), testing$turnout16_2016)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Did not vote Voted
## Did not vote 14 166
## Voted 38 1119
##
## Accuracy : 0.8474
## 95% CI : (0.827, 0.8663)
## No Information Rate : 0.9611
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0642
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.26923
## Specificity : 0.87082
## Pos Pred Value : 0.07778
## Neg Pred Value : 0.96716
## Prevalence : 0.03889
## Detection Rate : 0.01047
## Detection Prevalence : 0.13463
## Balanced Accuracy : 0.57002
##
## 'Positive' Class : Did not vote
##
# Confusion matrix for random forest model on testing data
# The random forest that looked perfect on the training data catches only 1 of
# 52 'Did not vote' cases here (see output below) - a classic overfitting sign.
caret::confusionMatrix(predict(vote_rf, testing), testing$turnout16_2016)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Did not vote Voted
## Did not vote 1 1
## Voted 51 1284
##
## Accuracy : 0.9611
## 95% CI : (0.9493, 0.9708)
## No Information Rate : 0.9611
## P-Value [Acc > NIR] : 0.5368
##
## Kappa : 0.0343
## Mcnemar's Test P-Value : 1.083e-11
##
## Sensitivity : 0.0192308
## Specificity : 0.9992218
## Pos Pred Value : 0.5000000
## Neg Pred Value : 0.9617978
## Prevalence : 0.0388930
## Detection Rate : 0.0007479
## Detection Prevalence : 0.0014959
## Balanced Accuracy : 0.5092263
##
## 'Positive' Class : Did not vote
##
Chapter 4 - Nuns
Catholic sisters survey from 1967 - https://curate.nd.edu/show/0r967368551 with codebook at https://curate.nd.edu/downloads/0v838051f6x
Exploratory data analysis with tidy data:
Predicting age with supervised learning:
Wrap up:
Example code includes:
# Read the 1967 Catholic sisters survey; per the parse spec printed below,
# every column is read as an integer (age, sister ID, and Likert items v116-v180).
sisters67 <- readr::read_csv("./RInputFiles/sisters.csv")
## Parsed with column specification:
## cols(
## .default = col_integer()
## )
## See spec(...) for full column specifications.
str(sisters67, give.attr = FALSE)
## Classes 'tbl_df', 'tbl' and 'data.frame': 19278 obs. of 67 variables:
## $ age : int 40 30 40 30 40 30 70 30 60 80 ...
## $ sister: int 11545 16953 73323 75339 36303 95318 22474 114526 20707 91062 ...
## $ v116 : int 5 4 2 4 4 2 4 4 4 5 ...
## $ v117 : int 2 1 2 3 2 4 5 1 5 1 ...
## $ v118 : int 2 4 5 3 3 5 5 4 5 2 ...
## $ v119 : int 2 4 5 4 5 5 5 5 4 1 ...
## $ v120 : int 4 1 3 3 1 1 5 1 5 2 ...
## $ v121 : int 4 1 4 4 4 5 4 1 5 5 ...
## $ v122 : int 4 1 2 2 4 1 1 2 2 1 ...
## $ v123 : int 5 5 3 4 4 3 1 5 2 5 ...
## $ v124 : int 1 1 5 2 3 1 5 3 5 4 ...
## $ v125 : int 4 2 5 3 4 2 5 2 5 5 ...
## $ v126 : int 2 1 1 3 1 1 5 1 5 2 ...
## $ v127 : int 1 4 5 2 2 1 1 1 4 1 ...
## $ v128 : int 2 1 4 3 4 4 5 2 5 3 ...
## $ v129 : int 4 4 5 4 5 4 5 5 4 1 ...
## $ v130 : int 2 4 4 3 3 1 5 1 5 4 ...
## $ v131 : int 1 2 2 3 5 5 2 3 3 2 ...
## $ v132 : int 5 5 5 4 5 2 2 5 4 5 ...
## $ v133 : int 2 4 5 3 5 1 4 2 4 4 ...
## $ v134 : int 2 4 4 3 4 4 1 4 4 2 ...
## $ v135 : int 5 5 4 3 5 4 1 5 5 2 ...
## $ v136 : int 1 4 4 2 4 4 1 4 4 2 ...
## $ v137 : int 1 1 1 1 1 1 2 1 2 4 ...
## $ v138 : int 2 1 3 1 3 1 4 1 2 1 ...
## $ v139 : int 3 1 3 3 1 1 4 1 5 4 ...
## $ v140 : int 1 2 1 2 4 4 5 2 5 2 ...
## $ v141 : int 5 5 4 3 3 3 4 5 4 4 ...
## $ v142 : int 1 1 2 2 2 1 2 1 4 3 ...
## $ v143 : int 2 1 5 4 4 5 4 5 4 1 ...
## $ v144 : int 1 2 1 2 1 1 3 1 4 2 ...
## $ v145 : int 4 4 5 3 4 1 5 2 5 4 ...
## $ v146 : int 4 4 5 4 5 5 4 5 2 4 ...
## $ v147 : int 2 2 1 2 3 1 2 1 2 2 ...
## $ v148 : int 1 1 4 1 1 4 4 1 5 1 ...
## $ v149 : int 4 2 4 2 1 1 2 1 5 4 ...
## $ v150 : int 2 1 2 3 1 4 2 1 5 2 ...
## $ v151 : int 4 1 5 4 4 1 5 1 4 3 ...
## $ v152 : int 2 1 1 3 1 1 2 1 4 4 ...
## $ v153 : int 5 5 5 5 5 5 5 5 5 2 ...
## $ v154 : int 1 1 4 2 1 3 5 1 4 2 ...
## $ v155 : int 5 4 4 3 5 5 4 5 4 4 ...
## $ v156 : int 1 1 2 2 1 1 5 1 5 2 ...
## $ v157 : int 4 1 4 3 1 1 2 1 3 4 ...
## $ v158 : int 4 4 5 2 5 5 2 5 5 4 ...
## $ v159 : int 1 4 4 1 2 1 4 1 4 2 ...
## $ v160 : int 2 5 5 4 4 4 5 5 5 4 ...
## $ v161 : int 2 4 3 3 1 1 4 1 2 4 ...
## $ v162 : int 5 4 5 4 4 4 5 5 5 4 ...
## $ v163 : int 2 1 2 3 1 1 2 1 4 1 ...
## $ v164 : int 4 1 5 2 4 1 5 1 5 4 ...
## $ v165 : int 2 1 3 2 1 1 1 1 2 2 ...
## $ v166 : int 2 4 5 2 1 1 5 2 5 4 ...
## $ v167 : int 2 4 5 3 4 4 2 4 5 2 ...
## $ v168 : int 5 5 5 4 5 5 5 5 4 5 ...
## $ v169 : int 1 1 1 2 1 1 5 1 4 4 ...
## $ v170 : int 5 1 4 3 2 4 4 1 2 4 ...
## $ v171 : int 5 5 5 4 1 2 5 5 5 5 ...
## $ v172 : int 2 1 5 5 2 2 5 1 5 3 ...
## $ v173 : int 2 2 4 2 2 1 4 1 1 4 ...
## $ v174 : int 2 4 2 3 4 1 5 5 4 2 ...
## $ v175 : int 1 1 4 2 2 1 2 1 5 4 ...
## $ v176 : int 4 4 4 3 1 4 4 3 3 2 ...
## $ v177 : int 4 4 5 3 4 2 4 4 4 4 ...
## $ v178 : int 4 1 4 2 1 1 2 1 4 4 ...
## $ v179 : int 4 4 4 3 4 2 4 4 5 4 ...
## $ v180 : int 4 2 5 3 3 1 1 1 1 2 ...
# View sisters67
glimpse(sisters67)
## Observations: 19,278
## Variables: 67
## $ age <int> 40, 30, 40, 30, 40, 30, 70, 30, 60, 80, 90, 40, 60, 80,...
## $ sister <int> 11545, 16953, 73323, 75339, 36303, 95318, 22474, 114526...
## $ v116 <int> 5, 4, 2, 4, 4, 2, 4, 4, 4, 5, 2, 5, 4, 4, 3, 4, 5, 3, 4...
## $ v117 <int> 2, 1, 2, 3, 2, 4, 5, 1, 5, 1, 3, 2, 5, 4, 1, 1, 1, 1, 2...
## $ v118 <int> 2, 4, 5, 3, 3, 5, 5, 4, 5, 2, 4, 4, 4, 5, 2, 4, 4, 4, 2...
## $ v119 <int> 2, 4, 5, 4, 5, 5, 5, 5, 4, 1, 4, 5, 3, 4, 5, 5, 5, 5, 4...
## $ v120 <int> 4, 1, 3, 3, 1, 1, 5, 1, 5, 2, 3, 1, 5, 4, 4, 1, 1, 1, 2...
## $ v121 <int> 4, 1, 4, 4, 4, 5, 4, 1, 5, 5, 4, 1, 3, 4, 3, 2, 5, 3, 3...
## $ v122 <int> 4, 1, 2, 2, 4, 1, 1, 2, 2, 1, 4, 5, 1, 2, 4, 2, 1, 4, 2...
## $ v123 <int> 5, 5, 3, 4, 4, 3, 1, 5, 2, 5, 3, 4, 3, 4, 5, 5, 4, 5, 4...
## $ v124 <int> 1, 1, 5, 2, 3, 1, 5, 3, 5, 4, 4, 1, 3, 2, 1, 1, 3, 2, 2...
## $ v125 <int> 4, 2, 5, 3, 4, 2, 5, 2, 5, 5, 5, 5, 5, 5, 1, 1, 5, 1, 2...
## $ v126 <int> 2, 1, 1, 3, 1, 1, 5, 1, 5, 2, 4, 1, 5, 1, 3, 1, 5, 1, 2...
## $ v127 <int> 1, 4, 5, 2, 2, 1, 1, 1, 4, 1, 4, 1, 3, 5, 2, 1, 1, 2, 2...
## $ v128 <int> 2, 1, 4, 3, 4, 4, 5, 2, 5, 3, 2, 5, 5, 4, 1, 1, 4, 1, 1...
## $ v129 <int> 4, 4, 5, 4, 5, 4, 5, 5, 4, 1, 5, 1, 5, 5, 5, 1, 5, 5, 5...
## $ v130 <int> 2, 4, 4, 3, 3, 1, 5, 1, 5, 4, 5, 5, 1, 4, 1, 1, 4, 3, 2...
## $ v131 <int> 1, 2, 2, 3, 5, 5, 2, 3, 3, 2, 3, 4, 3, 4, 2, 4, 3, 4, 4...
## $ v132 <int> 5, 5, 5, 4, 5, 2, 2, 5, 4, 5, 4, 5, 5, 5, 4, 5, 3, 5, 5...
## $ v133 <int> 2, 4, 5, 3, 5, 1, 4, 2, 4, 4, 5, 1, 1, 1, 2, 4, 3, 1, 2...
## $ v134 <int> 2, 4, 4, 3, 4, 4, 1, 4, 4, 2, 3, 5, 2, 4, 4, 4, 3, 3, 4...
## $ v135 <int> 5, 5, 4, 3, 5, 4, 1, 5, 5, 2, 4, 5, 3, 5, 2, 5, 3, 5, 5...
## $ v136 <int> 1, 4, 4, 2, 4, 4, 1, 4, 4, 2, 4, 4, 4, 4, 2, 2, 4, 2, 2...
## $ v137 <int> 1, 1, 1, 1, 1, 1, 2, 1, 2, 4, 5, 1, 3, 1, 1, 1, 1, 1, 1...
## $ v138 <int> 2, 1, 3, 1, 3, 1, 4, 1, 2, 1, 3, 2, 1, 3, 2, 1, 4, 3, 1...
## $ v139 <int> 3, 1, 3, 3, 1, 1, 4, 1, 5, 4, 4, 1, 2, 4, 1, 1, 2, 1, 1...
## $ v140 <int> 1, 2, 1, 2, 4, 4, 5, 2, 5, 2, 2, 1, 5, 2, 1, 4, 1, 2, 2...
## $ v141 <int> 5, 5, 4, 3, 3, 3, 4, 5, 4, 4, 5, 5, 5, 5, 5, 4, 4, 3, 5...
## $ v142 <int> 1, 1, 2, 2, 2, 1, 2, 1, 4, 3, 4, 2, 2, 3, 2, 2, 1, 3, 1...
## $ v143 <int> 2, 1, 5, 4, 4, 5, 4, 5, 4, 1, 4, 5, 5, 2, 5, 5, 3, 3, 5...
## $ v144 <int> 1, 2, 1, 2, 1, 1, 3, 1, 4, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1...
## $ v145 <int> 4, 4, 5, 3, 4, 1, 5, 2, 5, 4, 4, 1, 4, 5, 2, 2, 1, 2, 2...
## $ v146 <int> 4, 4, 5, 4, 5, 5, 4, 5, 2, 4, 4, 4, 4, 4, 2, 5, 3, 5, 4...
## $ v147 <int> 2, 2, 1, 2, 3, 1, 2, 1, 2, 2, 3, 1, 2, 1, 2, 2, 3, 2, 4...
## $ v148 <int> 1, 1, 4, 1, 1, 4, 4, 1, 5, 1, 4, 1, 3, 1, 1, 1, 2, 1, 1...
## $ v149 <int> 4, 2, 4, 2, 1, 1, 2, 1, 5, 4, 4, 2, 5, 1, 1, 2, 5, 2, 1...
## $ v150 <int> 2, 1, 2, 3, 1, 4, 2, 1, 5, 2, 5, 2, 2, 2, 3, 1, 5, 1, 1...
## $ v151 <int> 4, 1, 5, 4, 4, 1, 5, 1, 4, 3, 4, 1, 2, 5, 2, 4, 5, 1, 4...
## $ v152 <int> 2, 1, 1, 3, 1, 1, 2, 1, 4, 4, 4, 1, 4, 3, 4, 1, 1, 1, 2...
## $ v153 <int> 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 3, 2, 5, 5, 4, 5, 5, 5, 4...
## $ v154 <int> 1, 1, 4, 2, 1, 3, 5, 1, 4, 2, 5, 1, 5, 5, 1, 1, 4, 1, 1...
## $ v155 <int> 5, 4, 4, 3, 5, 5, 4, 5, 4, 4, 3, 4, 3, 5, 2, 5, 5, 5, 1...
## $ v156 <int> 1, 1, 2, 2, 1, 1, 5, 1, 5, 2, 5, 1, 1, 4, 1, 1, 3, 1, 1...
## $ v157 <int> 4, 1, 4, 3, 1, 1, 2, 1, 3, 4, 2, 1, 2, 3, 3, 2, 3, 1, 1...
## $ v158 <int> 4, 4, 5, 2, 5, 5, 2, 5, 5, 4, 4, 5, 4, 2, 5, 4, 4, 3, 4...
## $ v159 <int> 1, 4, 4, 1, 2, 1, 4, 1, 4, 2, 4, 1, 3, 2, 1, 1, 2, 1, 1...
## $ v160 <int> 2, 5, 5, 4, 4, 4, 5, 5, 5, 4, 5, 2, 5, 5, 5, 4, 5, 2, 4...
## $ v161 <int> 2, 4, 3, 3, 1, 1, 4, 1, 2, 4, 5, 1, 4, 5, 1, 1, 3, 1, 1...
## $ v162 <int> 5, 4, 5, 4, 4, 4, 5, 5, 5, 4, 4, 5, 5, 5, 3, 4, 5, 5, 5...
## $ v163 <int> 2, 1, 2, 3, 1, 1, 2, 1, 4, 1, 4, 1, 1, 1, 1, 2, 3, 3, 1...
## $ v164 <int> 4, 1, 5, 2, 4, 1, 5, 1, 5, 4, 4, 1, 1, 5, 1, 4, 3, 1, 4...
## $ v165 <int> 2, 1, 3, 2, 1, 1, 1, 1, 2, 2, 5, 2, 1, 5, 2, 3, 3, 2, 4...
## $ v166 <int> 2, 4, 5, 2, 1, 1, 5, 2, 5, 4, 5, 1, 2, 4, 2, 4, 5, 3, 4...
## $ v167 <int> 2, 4, 5, 3, 4, 4, 2, 4, 5, 2, 4, 4, 2, 5, 2, 4, 3, 2, 4...
## $ v168 <int> 5, 5, 5, 4, 5, 5, 5, 5, 4, 5, 5, 4, 5, 5, 3, 4, 3, 4, 5...
## $ v169 <int> 1, 1, 1, 2, 1, 1, 5, 1, 4, 4, 5, 1, 1, 1, 1, 1, 1, 1, 1...
## $ v170 <int> 5, 1, 4, 3, 2, 4, 4, 1, 2, 4, 3, 3, 3, 5, 4, 3, 5, 3, 4...
## $ v171 <int> 5, 5, 5, 4, 1, 2, 5, 5, 5, 5, 5, 1, 5, 5, 3, 4, 5, 4, 5...
## $ v172 <int> 2, 1, 5, 5, 2, 2, 5, 1, 5, 3, 5, 1, 5, 5, 2, 2, 3, 5, 2...
## $ v173 <int> 2, 2, 4, 2, 2, 1, 4, 1, 1, 4, 4, 1, 2, 5, 4, 4, 3, 1, 4...
## $ v174 <int> 2, 4, 2, 3, 4, 1, 5, 5, 4, 2, 4, 5, 3, 4, 2, 4, 3, 3, 4...
## $ v175 <int> 1, 1, 4, 2, 2, 1, 2, 1, 5, 4, 3, 1, 2, 4, 1, 4, 3, 1, 1...
## $ v176 <int> 4, 4, 4, 3, 1, 4, 4, 3, 3, 2, 5, 5, 3, 5, 3, 1, 3, 3, 2...
## $ v177 <int> 4, 4, 5, 3, 4, 2, 4, 4, 4, 4, 5, 2, 5, 5, 3, 2, 5, 4, 4...
## $ v178 <int> 4, 1, 4, 2, 1, 1, 2, 1, 4, 4, 4, 1, 2, 4, 1, 2, 3, 1, 2...
## $ v179 <int> 4, 4, 4, 3, 4, 2, 4, 4, 5, 4, 5, 2, 5, 5, 3, 1, 5, 3, 4...
## $ v180 <int> 4, 2, 5, 3, 3, 1, 1, 1, 1, 2, 4, 2, 2, 5, 1, 1, 3, 3, 2...
# Plot the histogram
# Distribution of respondent ages in 10-year bins.
ggplot(sisters67, aes(x = age)) +
geom_histogram(binwidth = 10)
# Tidy the data set
# Reshape wide -> long: drop the sister ID, then stack the v116-v180 question
# columns into key/value pairs, giving one row per (respondent, question).
# NOTE(review): gather() is superseded by tidyr::pivot_longer(), but it is kept
# here because pivot_longer() orders rows differently (row-major rather than
# column-major) and the glimpse() output recorded below reflects gather()'s
# ordering.
tidy_sisters <- sisters67 %>%
select(-sister) %>%
gather(key, value, -age)
# Print the structure of tidy_sisters
glimpse(tidy_sisters)
## Observations: 1,253,070
## Variables: 3
## $ age <int> 40, 30, 40, 30, 40, 30, 70, 30, 60, 80, 90, 40, 60, 80, ...
## $ key <chr> "v116", "v116", "v116", "v116", "v116", "v116", "v116", ...
## $ value <int> 5, 4, 2, 4, 4, 2, 4, 4, 4, 5, 2, 5, 4, 4, 3, 4, 5, 3, 4,...
# Overall agreement with all questions varied by age
# Mean Likert response across all questions for each age decade; per the
# output below, average agreement rises steadily with age (2.82 at 20 up to
# 3.93 at 100).
tidy_sisters %>%
group_by(age) %>%
summarize(value = mean(value, na.rm = TRUE))
## # A tibble: 9 x 2
## age value
## <int> <dbl>
## 1 20 2.82
## 2 30 2.81
## 3 40 2.82
## 4 50 2.95
## 5 60 3.10
## 6 70 3.25
## 7 80 3.39
## 8 90 3.55
## 9 100 3.93
# Number of respondents agreed or disagreed overall
# Frequency of each Likert response level (1-5) across all question/respondent
# pairs.
tidy_sisters %>%
count(value)
## # A tibble: 5 x 2
## value n
## <int> <int>
## 1 1 326386
## 2 2 211534
## 3 3 160961
## 4 4 277062
## 5 5 277127
# Visualize agreement with age
# Restrict to questions v153-v170, compute mean respondent age at each
# response level per question, and draw one small line chart per question
# (facets, 3 rows). Lines sloping up indicate answers that skew older.
tidy_sisters %>%
filter(key %in% paste0("v", 153:170)) %>%
group_by(key, value) %>%
summarize(age = mean(age, na.rm = TRUE)) %>%
ggplot(aes(value, age, color = key)) +
geom_line(show.legend = FALSE) +
facet_wrap(~key, nrow = 3)
# Remove the sister column
# Drop the respondent ID so it is not picked up as a predictor by `age ~ .`.
sisters_select <- sisters67 %>%
select(-sister)
# Build a simple linear regression model
# Regress age on all 65 remaining survey items as a baseline model.
simple_lm <- lm(age ~ .,
data = sisters_select)
# Print the summary of the model
summary(simple_lm)
##
## Call:
## lm(formula = age ~ ., data = sisters_select)
##
## Residuals:
## Min 1Q Median 3Q Max
## -46.663 -9.586 -1.207 8.991 53.286
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 27.59542 1.07173 25.748 < 2e-16 ***
## v116 -0.69014 0.07727 -8.931 < 2e-16 ***
## v117 -0.15914 0.08869 -1.794 0.072786 .
## v118 -0.74668 0.08473 -8.813 < 2e-16 ***
## v119 -0.35314 0.08321 -4.244 2.21e-05 ***
## v120 -0.13875 0.07513 -1.847 0.064813 .
## v121 0.04265 0.07794 0.547 0.584247
## v122 0.05237 0.08086 0.648 0.517208
## v123 -0.96372 0.09061 -10.636 < 2e-16 ***
## v124 0.44543 0.08681 5.131 2.91e-07 ***
## v125 0.50420 0.07425 6.791 1.15e-11 ***
## v126 0.44358 0.08579 5.170 2.36e-07 ***
## v127 -0.04781 0.07915 -0.604 0.545810
## v128 0.04459 0.07595 0.587 0.557162
## v129 0.03044 0.07881 0.386 0.699351
## v130 0.51028 0.08064 6.328 2.54e-10 ***
## v131 -0.54431 0.08417 -6.467 1.02e-10 ***
## v132 -0.02527 0.09337 -0.271 0.786703
## v133 -0.67041 0.07563 -8.864 < 2e-16 ***
## v134 -0.12144 0.09060 -1.340 0.180130
## v135 0.45773 0.10886 4.205 2.63e-05 ***
## v136 -0.08790 0.07438 -1.182 0.237293
## v137 0.74412 0.10230 7.274 3.63e-13 ***
## v138 0.31534 0.10601 2.974 0.002939 **
## v139 1.36585 0.10514 12.990 < 2e-16 ***
## v140 -0.73675 0.07371 -9.995 < 2e-16 ***
## v141 0.50515 0.09355 5.400 6.75e-08 ***
## v142 -0.22168 0.08357 -2.653 0.007992 **
## v143 0.08320 0.08375 0.993 0.320536
## v144 1.09413 0.10870 10.066 < 2e-16 ***
## v145 -0.46821 0.08217 -5.698 1.23e-08 ***
## v146 -0.50063 0.08094 -6.185 6.32e-10 ***
## v147 -0.28499 0.09800 -2.908 0.003640 **
## v148 1.47288 0.09165 16.070 < 2e-16 ***
## v149 -0.29683 0.08562 -3.467 0.000528 ***
## v150 -0.33882 0.08396 -4.036 5.46e-05 ***
## v151 0.79497 0.08901 8.931 < 2e-16 ***
## v152 -0.02073 0.08179 -0.253 0.799906
## v153 -0.53982 0.09110 -5.925 3.17e-09 ***
## v154 0.98930 0.07843 12.614 < 2e-16 ***
## v155 0.96066 0.09897 9.706 < 2e-16 ***
## v156 1.07836 0.09176 11.752 < 2e-16 ***
## v157 0.07577 0.08249 0.918 0.358378
## v158 0.05330 0.08419 0.633 0.526696
## v159 -0.28846 0.08321 -3.467 0.000528 ***
## v160 0.28066 0.08559 3.279 0.001043 **
## v161 0.67235 0.08759 7.677 1.71e-14 ***
## v162 -0.29388 0.10063 -2.920 0.003501 **
## v163 -1.38883 0.09242 -15.027 < 2e-16 ***
## v164 -0.44411 0.07017 -6.329 2.52e-10 ***
## v165 -0.49356 0.09033 -5.464 4.71e-08 ***
## v166 0.24787 0.08329 2.976 0.002924 **
## v167 -0.06290 0.08185 -0.768 0.442236
## v168 0.33712 0.09425 3.577 0.000349 ***
## v169 1.44938 0.08634 16.786 < 2e-16 ***
## v170 1.01626 0.09083 11.189 < 2e-16 ***
## v171 0.90086 0.08359 10.777 < 2e-16 ***
## v172 0.07702 0.07176 1.073 0.283135
## v173 0.76461 0.06936 11.025 < 2e-16 ***
## v174 0.22074 0.07851 2.812 0.004934 **
## v175 0.18369 0.07930 2.316 0.020553 *
## v176 1.03334 0.08996 11.487 < 2e-16 ***
## v177 -0.07908 0.09643 -0.820 0.412153
## v178 -0.08005 0.08250 -0.970 0.331906
## v179 0.29778 0.09251 3.219 0.001289 **
## v180 0.11524 0.08566 1.345 0.178538
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.13 on 19212 degrees of freedom
## Multiple R-squared: 0.3332, Adjusted R-squared: 0.3309
## F-statistic: 147.7 on 65 and 19212 DF, p-value: < 2.2e-16
# Split the data into training and validation/test sets
# Stratified 60/40 split on age via caret. The seed is set before each
# partition so the splits are reproducible.
set.seed(1234)
in_train <- caret::createDataPartition(sisters_select$age, p = 0.6, list = FALSE)
training <- sisters_select[in_train, ]
validation_test <- sisters_select[-in_train, ]
# Split the validation and test sets
# Halve the remaining 40% into testing and validation (60/20/20 overall).
set.seed(1234)
in_test <- caret::createDataPartition(validation_test$age, p = 0.5, list = FALSE)
testing <- validation_test[in_test, ]
validation <- validation_test[-in_test, ]
# Fit a CART model
# Decision tree (rpart) with caret's default bootstrap resampling to tune cp.
sisters_cart <- caret::train(age ~ ., method = "rpart", data = training)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info =
## trainInfo, : There were missing values in resampled performance measures.
# Print the CART model
sisters_cart
## CART
##
## 11569 samples
## 65 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 11569, 11569, 11569, 11569, 11569, 11569, ...
## Resampling results across tuning parameters:
##
## cp RMSE Rsquared MAE
## 0.02304336 14.61359 0.1724244 12.00686
## 0.04935303 14.89119 0.1403800 12.41303
## 0.11481230 15.54485 0.1046127 13.19914
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.02304336.
# Subsample 500 training rows (without replacement, kept in original row
# order) so the boosted-tree fits below run in reasonable time.
# NOTE(review): seq_len(nrow(training)) would be the safer idiom than
# 1:nrow(training); harmless here since training is known to be non-empty.
inSmall <- sample(1:nrow(training), 500, replace=FALSE)
smallSisters <- training[sort(inSmall), ]
# Gradient-boosted trees: xgboost and gbm, both tuned by caret's defaults.
sisters_xgb <- caret::train(age ~ ., method = "xgbTree", data = smallSisters)
sisters_gbm <- caret::train(age ~ ., method = "gbm", data = smallSisters, verbose=FALSE)
# Make predictions on the three models
# Score the validation set with all three models, adding one prediction
# column per model alongside the original data.
modeling_results <- validation %>%
mutate(CART = predict(sisters_cart, validation),
XGB = predict(sisters_xgb, validation),
GBM = predict(sisters_gbm, validation))
# View the predictions
modeling_results %>%
select(CART, XGB, GBM)
## # A tibble: 3,854 x 3
## CART XGB GBM
## <dbl> <dbl> <dbl>
## 1 49.5 46.2 44.3
## 2 49.5 61.1 56.5
## 3 58.0 59.9 65.6
## 4 58.0 60.0 61.9
## 5 58.0 71.6 74.6
## 6 49.5 50.9 53.4
## 7 49.5 58.6 55.0
## 8 49.5 42.2 38.0
## 9 41.3 41.7 38.2
## 10 58.0 51.6 50.0
## # ... with 3,844 more rows
# Compare performance
# RMSE and R-squared of the CART predictions against the true ages on the
# validation set (yardstick infers the metrics from the numeric columns).
yardstick::metrics(modeling_results, truth = age, estimate = CART)
## # A tibble: 1 x 2
## rmse rsq
## <dbl> <dbl>
## 1 14.6 0.163
yardstick::metrics(modeling_results, truth = age, estimate = XGB)
## # A tibble: 1 x 2
## rmse rsq
## <dbl> <dbl>
## 1 13.5 0.287
yardstick::metrics(modeling_results, truth = age, estimate = GBM)
## # A tibble: 1 x 2
## rmse rsq
## <dbl> <dbl>
## 1 13.6 0.286
# Calculate RMSE
# Final, one-time evaluation of the chosen model (GBM) on the untouched
# testing set - the validation set above was used for model selection.
testing %>%
mutate(prediction = predict(sisters_gbm, testing)) %>%
yardstick::rmse(truth = age, estimate = prediction)
## [1] 13.87981
Chapter 1 - Introduction to Process Analysis
Introduction and overview:
Activities as cornerstones of processes:
Components of process data:
Example code includes:
# Load the processmapR package using library
library(processmapR)
##
## Attaching package: 'processmapR'
## The following object is masked from 'package:stats':
##
## frequency
library(bupaR)
## Loading required package: edeaR
## Loading required package: eventdataR
## Loading required package: xesreadR
## Loading required package: processmonitR
## Warning in library(package, lib.loc = lib.loc, character.only = TRUE,
## logical.return = TRUE, : there is no package called 'processmonitR'
## Loading required package: petrinetR
## Warning in library(package, lib.loc = lib.loc, character.only = TRUE,
## logical.return = TRUE, : there is no package called 'petrinetR'
##
## Attaching package: 'bupaR'
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:utils':
##
## timestamp
handling <- c('Registration', 'Registration', 'Registration', 'Registration', 'Registration', 'Registration', 'Registration', 'Registration', 'Registration', 'Registration', 'Registration', 'Registration', 'Triage and Assessment', 'Triage and Assessment', 'Triage and Assessment', 'Triage and Assessment', 'Triage and Assessment', 'Triage and Assessment', 'Triage and Assessment', 'Triage and Assessment', 'Triage and Assessment', 'Triage and Assessment', 'Triage and Assessment', 'Triage and Assessment', 'Blood test', 'Blood test', 'Blood test', 'Blood test', 'Blood test', 'Blood test', 'Blood test', 'Blood test', 'MRI SCAN', 'MRI SCAN', 'MRI SCAN', 'MRI SCAN', 'MRI SCAN', 'MRI SCAN', 'MRI SCAN', 'MRI SCAN', 'X-Ray', 'X-Ray', 'X-Ray', 'X-Ray', 'Discuss Results', 'Discuss Results', 'Discuss Results', 'Discuss Results', 'Discuss Results', 'Discuss Results', 'Discuss Results', 'Discuss Results', 'Discuss Results', 'Discuss Results', 'Discuss Results', 'Discuss Results', 'Check-out', 'Check-out', 'Check-out', 'Check-out', 'Check-out', 'Check-out', 'Check-out', 'Check-out', 'Check-out', 'Check-out', 'Check-out', 'Check-out', 'Registration', 'Registration', 'Registration', 'Registration', 'Registration', 'Registration', 'Registration', 'Registration', 'Registration', 'Registration', 'Registration', 'Registration', 'Triage and Assessment', 'Triage and Assessment', 'Triage and Assessment', 'Triage and Assessment', 'Triage and Assessment', 'Triage and Assessment', 'Triage and Assessment', 'Triage and Assessment', 'Triage and Assessment', 'Triage and Assessment', 'Triage and Assessment', 'Triage and Assessment', 'Blood test', 'Blood test', 'Blood test', 'Blood test', 'Blood test', 'Blood test', 'Blood test', 'Blood test', 'MRI SCAN', 'MRI SCAN', 'MRI SCAN', 'MRI SCAN', 'MRI SCAN', 'MRI SCAN', 'MRI SCAN', 'MRI SCAN', 'X-Ray', 'X-Ray', 'X-Ray', 'X-Ray', 'Discuss Results', 'Discuss Results', 'Discuss Results', 'Discuss Results', 'Discuss Results', 'Discuss Results', 'Discuss 
Results', 'Discuss Results', 'Discuss Results', 'Discuss Results', 'Discuss Results', 'Discuss Results', 'Check-out', 'Check-out', 'Check-out', 'Check-out', 'Check-out', 'Check-out', 'Check-out', 'Check-out', 'Check-out', 'Check-out', 'Check-out', 'Check-out')
patient <- c('43', '155', '156', '170', '172', '184', '221', '278', '348', '420', '455', '493', '43', '155', '156', '170', '172', '184', '221', '278', '348', '420', '455', '493', '43', '156', '170', '172', '184', '278', '348', '420', '43', '156', '170', '172', '184', '278', '348', '420', '155', '221', '455', '493', '43', '155', '156', '170', '172', '184', '221', '278', '348', '420', '455', '493', '43', '155', '156', '170', '172', '184', '221', '278', '348', '420', '455', '493', '43', '155', '156', '170', '172', '184', '221', '278', '348', '420', '455', '493', '43', '155', '156', '170', '172', '184', '221', '278', '348', '420', '455', '493', '43', '156', '170', '172', '184', '278', '348', '420', '43', '156', '170', '172', '184', '278', '348', '420', '155', '221', '455', '493', '43', '155', '156', '170', '172', '184', '221', '278', '348', '420', '455', '493', '43', '155', '156', '170', '172', '184', '221', '278', '348', '420', '455', '493')
employee <- c('r1', 'r1', 'r1', 'r1', 'r1', 'r1', 'r1', 'r1', 'r1', 'r1', 'r1', 'r1', 'r2', 'r2', 'r2', 'r2', 'r2', 'r2', 'r2', 'r2', 'r2', 'r2', 'r2', 'r2', 'r3', 'r3', 'r3', 'r3', 'r3', 'r3', 'r3', 'r3', 'r4', 'r4', 'r4', 'r4', 'r4', 'r4', 'r4', 'r4', 'r5', 'r5', 'r5', 'r5', 'r6', 'r6', 'r6', 'r6', 'r6', 'r6', 'r6', 'r6', 'r6', 'r6', 'r6', 'r6', 'r7', 'r7', 'r7', 'r7', 'r7', 'r7', 'r7', 'r7', 'r7', 'r7', 'r7', 'r7', 'r1', 'r1', 'r1', 'r1', 'r1', 'r1', 'r1', 'r1', 'r1', 'r1', 'r1', 'r1', 'r2', 'r2', 'r2', 'r2', 'r2', 'r2', 'r2', 'r2', 'r2', 'r2', 'r2', 'r2', 'r3', 'r3', 'r3', 'r3', 'r3', 'r3', 'r3', 'r3', 'r4', 'r4', 'r4', 'r4', 'r4', 'r4', 'r4', 'r4', 'r5', 'r5', 'r5', 'r5', 'r6', 'r6', 'r6', 'r6', 'r6', 'r6', 'r6', 'r6', 'r6', 'r6', 'r6', 'r6', 'r7', 'r7', 'r7', 'r7', 'r7', 'r7', 'r7', 'r7', 'r7', 'r7', 'r7', 'r7')
handling_id <- c('43', '155', '156', '170', '172', '184', '221', '278', '348', '420', '455', '493', '543', '655', '656', '670', '672', '684', '721', '778', '848', '920', '955', '993', '1020', '1072', '1081', '1082', '1088', '1127', '1163', '1199', '1257', '1309', '1318', '1319', '1325', '1364', '1400', '1436', '1557', '1587', '1710', '1730', '1777', '1889', '1890', '1904', '1906', '1918', '1955', '2012', '2082', '2154', '2189', '2227', '2272', '2384', '2385', '2399', '2401', '2413', '2450', '2507', '2577', '2649', '2684', '2720', '43', '155', '156', '170', '172', '184', '221', '278', '348', '420', '455', '493', '543', '655', '656', '670', '672', '684', '721', '778', '848', '920', '955', '993', '1020', '1072', '1081', '1082', '1088', '1127', '1163', '1199', '1257', '1309', '1318', '1319', '1325', '1364', '1400', '1436', '1557', '1587', '1710', '1730', '1777', '1889', '1890', '1904', '1906', '1918', '1955', '2012', '2082', '2154', '2189', '2227', '2272', '2384', '2385', '2399', '2401', '2413', '2450', '2507', '2577', '2649', '2684', '2720')
registration_type <- c('start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete')
rTime <- c('2017-02-19 04:38:51', '2017-06-03 10:05:28', '2017-06-03 10:05:28', '2017-06-17 15:10:30', '2017-06-17 23:00:33', '2017-06-27 07:48:22', '2017-08-03 17:05:27', '2017-09-26 20:22:49', '2017-11-24 08:28:44', '2018-02-08 03:39:21', '2018-03-14 21:04:28', '2018-04-29 04:55:10', '2017-02-19 07:28:53', '2017-06-04 06:27:00', '2017-06-03 13:23:14', '2017-06-17 16:31:58', '2017-06-18 18:29:13', '2017-06-28 00:14:50', '2017-08-04 07:22:06', '2017-09-27 22:57:03', '2017-11-24 10:33:00', '2018-02-08 17:33:12', '2018-03-15 15:12:41', '2018-04-30 19:40:22', '2017-02-20 19:59:18', '2017-06-04 15:18:50', '2017-06-18 22:51:07', '2017-06-21 02:43:27', '2017-07-01 23:55:10', '2017-09-28 22:58:23', '2017-11-25 12:06:18', '2018-02-12 09:01:38', '2017-02-21 06:49:49', '2017-06-04 23:23:28', '2017-06-19 06:44:30', '2017-06-21 11:16:30', '2017-07-02 11:16:08', '2017-09-29 07:28:10', '2017-11-25 21:54:56', '2018-02-12 19:43:42', '2017-06-05 00:12:24', '2017-08-05 08:25:17', '2018-03-17 10:30:24', '2018-05-02 07:32:45', '2017-02-21 14:50:43', '2017-06-05 14:03:19', '2017-06-05 10:26:16', '2017-06-19 22:46:10', '2017-06-22 04:39:35', '2017-07-03 01:28:49', '2017-08-05 22:06:23', '2017-09-29 19:13:51', '2017-11-26 06:52:23', '2018-02-17 02:44:58', '2018-03-18 00:20:51', '2018-05-02 18:14:11', '2017-02-24 14:58:43', '2017-06-05 15:58:53', '2017-06-05 15:58:53', '2017-06-20 03:48:37', '2017-06-22 08:40:55', '2017-07-03 03:39:51', '2017-08-08 23:17:45', '2017-09-29 21:16:01', '2017-11-27 04:56:53', '2018-02-20 09:49:29', '2018-03-18 08:12:07', '2018-05-03 00:11:10', '2017-02-19 07:28:53', '2017-06-03 14:19:00', '2017-06-03 13:23:14', '2017-06-17 16:31:58', '2017-06-18 01:07:42', '2017-06-27 12:22:51', '2017-08-03 19:25:12', '2017-09-26 22:17:18', '2017-11-24 10:33:00', '2018-02-08 06:01:38', '2018-03-15 00:34:01', '2018-04-29 07:39:14', '2017-02-19 21:58:08', '2017-06-04 14:23:26', '2017-06-04 06:27:00', '2017-06-18 04:14:55', '2017-06-19 00:40:19', '2017-06-28 12:48:20', 
'2017-08-04 21:09:17', '2017-09-28 12:00:12', '2017-11-25 00:44:30', '2018-02-09 07:05:52', '2018-03-16 04:08:03', '2018-05-01 10:37:51', '2017-02-21 03:12:26', '2017-06-04 19:35:51', '2017-06-19 03:01:11', '2017-06-21 08:02:20', '2017-07-02 07:43:48', '2017-09-29 04:58:49', '2017-11-25 18:30:43', '2018-02-12 13:57:13', '2017-02-21 09:57:05', '2017-06-05 02:46:59', '2017-06-19 11:40:53', '2017-06-21 16:09:26', '2017-07-02 16:03:16', '2017-09-29 12:44:39', '2017-11-26 02:40:30', '2018-02-12 23:53:46', '2017-06-05 04:39:38', '2017-08-05 13:56:39', '2018-03-17 14:09:40', '2018-05-02 12:24:41', '2017-02-21 17:57:58', '2017-06-05 15:58:53', '2017-06-05 14:03:19', '2017-06-20 01:44:29', '2017-06-22 08:40:55', '2017-07-03 03:39:51', '2017-08-05 23:53:27', '2017-09-29 21:16:01', '2017-11-26 09:44:37', '2018-02-17 06:17:57', '2018-03-18 03:22:17', '2018-05-02 21:17:12', '2017-02-24 16:03:49', '2017-06-05 17:22:16', '2017-06-05 17:15:30', '2017-06-20 05:36:40', '2017-06-22 10:59:58', '2017-07-03 05:00:48', '2017-08-09 00:13:39', '2017-09-29 23:42:48', '2017-11-27 06:53:23', '2018-02-20 12:04:00', '2018-03-18 10:48:34', '2018-05-03 02:11:42')
rOrder <- c(43, 155, 156, 170, 172, 184, 221, 278, 348, 420, 455, 493, 543, 655, 656, 670, 672, 684, 721, 778, 848, 920, 955, 993, 1020, 1072, 1081, 1082, 1088, 1127, 1163, 1199, 1257, 1309, 1318, 1319, 1325, 1364, 1400, 1436, 1557, 1587, 1710, 1730, 1777, 1889, 1890, 1904, 1906, 1918, 1955, 2012, 2082, 2154, 2189, 2227, 2272, 2384, 2385, 2399, 2401, 2413, 2450, 2507, 2577, 2649, 2684, 2720, 2764, 2876, 2877, 2891, 2893, 2905, 2942, 2999, 3069, 3141, 3176, 3214, 3264, 3376, 3377, 3391, 3393, 3405, 3442, 3499, 3569, 3641, 3676, 3714, 3741, 3793, 3802, 3803, 3809, 3848, 3884, 3920, 3978, 4030, 4039, 4040, 4046, 4085, 4121, 4157, 4278, 4308, 4431, 4451, 4498, 4610, 4611, 4625, 4627, 4639, 4676, 4733, 4803, 4875, 4910, 4948, 4993, 5105, 5106, 5120, 5122, 5134, 5171, 5228, 5298, 5370, 5405, 5441)
# Assemble the patient event data into a single tibble. Activities and
# resources become factors with fixed level order; timestamps are parsed
# from the character vector rTime; .order preserves the original event order.
pFrame <- tibble(handling=factor(handling, levels=c('Blood test', 'Check-out', 'Discuss Results', 'MRI SCAN', 'Registration', 'Triage and Assessment', 'X-Ray')),
patient=patient,
employee=factor(employee, levels=c('r1', 'r2', 'r3', 'r4', 'r5', 'r6', 'r7')),
handling_id=handling_id,
registration_type=factor(registration_type, levels=c("complete", "start")),
time=as.POSIXct(rTime),
.order=rOrder
)
# Promote the tibble to a bupaR event log by declaring which column plays
# each role: the patient is the case, the handling step is the activity,
# handling_id pairs each start event with its complete event, and the
# employee is the resource.
patients <- eventlog(pFrame,
case_id = "patient",
activity_id = "handling",
activity_instance_id = "handling_id",
lifecycle_id = "registration_type",
timestamp = "time",
resource_id = "employee")
# The function slice() can be used to take a slice of cases out of the event data. slice(1:10) will select the first ten cases in the event log, where "first" is defined by the current ordering of the data.
# How many patients are there?
n_cases(patients)
## [1] 12
# Print the summary of the data
summary(patients)
## Number of events: 136
## Number of cases: 12
## Number of traces: 2
## Number of distinct activities: 7
## Average trace length: 11.33333
##
## Start eventlog: 2017-02-19 04:38:51
## End eventlog: 2018-05-03 02:11:42
## handling patient employee handling_id
## Blood test :16 Length:136 r1:24 Length:136
## Check-out :24 Class :character r2:24 Class :character
## Discuss Results :24 Mode :character r3:16 Mode :character
## MRI SCAN :16 r4:16
## Registration :24 r5: 8
## Triage and Assessment:24 r6:24
## X-Ray : 8 r7:24
## registration_type time .order
## complete:68 Min. :2017-02-19 04:38:51 Min. : 1.00
## start :68 1st Qu.:2017-06-14 15:43:26 1st Qu.: 34.75
## Median :2017-07-03 03:39:51 Median : 68.50
## Mean :2017-09-06 11:31:32 Mean : 68.50
## 3rd Qu.:2017-11-26 14:32:41 3rd Qu.:102.25
## Max. :2018-05-03 02:11:42 Max. :136.00
##
# Show the journey of the first patient
slice(patients, 1)
## Log of 12 events consisting of:
## 1 trace
## 1 case
## 6 instances of 6 activities
## 6 resources
## Events occurred from 2017-02-19 04:38:51 until 2017-02-24 16:03:49
##
## Variables were mapped as follows:
## Case identifier: patient
## Activity identifier: handling
## Resource identifier: employee
## Activity instance identifier: handling_id
## Timestamp: time
## Lifecycle transition: registration_type
##
## # A tibble: 12 x 7
## handling patient employee handling_id registration_ty~ time
## <fct> <chr> <fct> <chr> <fct> <dttm>
## 1 Registr~ 43 r1 43 start 2017-02-19 04:38:51
## 2 Triage ~ 43 r2 543 start 2017-02-19 07:28:53
## 3 Blood t~ 43 r3 1020 start 2017-02-20 19:59:18
## 4 MRI SCAN 43 r4 1257 start 2017-02-21 06:49:49
## 5 Discuss~ 43 r6 1777 start 2017-02-21 14:50:43
## 6 Check-o~ 43 r7 2272 start 2017-02-24 14:58:43
## 7 Registr~ 43 r1 43 complete 2017-02-19 07:28:53
## 8 Triage ~ 43 r2 543 complete 2017-02-19 21:58:08
## 9 Blood t~ 43 r3 1020 complete 2017-02-21 03:12:26
## 10 MRI SCAN 43 r4 1257 complete 2017-02-21 09:57:05
## 11 Discuss~ 43 r6 1777 complete 2017-02-21 17:57:58
## 12 Check-o~ 43 r7 2272 complete 2017-02-24 16:03:49
## # ... with 1 more variable: .order <int>
# How many distinct activities are there?
n_activities(patients)
## [1] 7
# What are the names of the activities?
activity_labels(patients)
## [1] Registration Triage and Assessment Blood test
## [4] MRI SCAN X-Ray Discuss Results
## [7] Check-out
## 7 Levels: Blood test Check-out Discuss Results MRI SCAN ... X-Ray
# Create a list of activities
activities(patients)
## # A tibble: 7 x 3
## handling absolute_frequency relative_frequency
## <fct> <int> <dbl>
## 1 Check-out 12 0.176
## 2 Discuss Results 12 0.176
## 3 Registration 12 0.176
## 4 Triage and Assessment 12 0.176
## 5 Blood test 8 0.118
## 6 MRI SCAN 8 0.118
## 7 X-Ray 4 0.0588
# Have a look at the different traces
traces(patients)
## # A tibble: 2 x 3
## trace absolute_frequen~ relative_frequen~
## <chr> <int> <dbl>
## 1 Registration,Triage and Assessment,Blood ~ 8 0.667
## 2 Registration,Triage and Assessment,X-Ray,~ 4 0.333
# How many are there?
n_traces(patients)
## [1] 2
# Visualize the traces using trace_explorer
# coverage=1 asks for traces covering 100% of cases, i.e. show every
# distinct trace (there are only 2 here).
trace_explorer(patients, coverage=1)
# Draw process map
# Directed graph of activities with frequencies on nodes and edges.
process_map(patients)
# Minimal hand-built insurance-claims data set: 5 events over 2 claims.
# action identifies the activity instance; note action 10024 appears twice
# (a start and a complete event for the same instance), while the other
# actions have only a start event.
claims <- tibble(id=c("claim1", "claim1", "claim2", "claim2", "claim2"),
action=c(10002L, 10011L, 10015L, 10024L, 10024L),
action_type=c("Check Contract", "Pay Back Decision", "Check Contract", "Pay Back Decision", "Pay Back Decision"),
date=as.Date(c("2008-01-12", "2008-03-22", "2008-01-13", "2008-03-23", "2008-04-14")),
originator=c("Assistant 1", "Manager 2", "Assistant 6", "Manager 2", "Manager 2"),
status=as.factor(c("start", "start", "start", "start", "complete"))
)
claims
## # A tibble: 5 x 6
## id action action_type date originator status
## <chr> <int> <chr> <date> <chr> <fct>
## 1 claim1 10002 Check Contract 2008-01-12 Assistant 1 start
## 2 claim1 10011 Pay Back Decision 2008-03-22 Manager 2 start
## 3 claim2 10015 Check Contract 2008-01-13 Assistant 6 start
## 4 claim2 10024 Pay Back Decision 2008-03-23 Manager 2 start
## 5 claim2 10024 Pay Back Decision 2008-04-14 Manager 2 complete
#create eventlog claims_log
claims_log <- eventlog(claims,
case_id = "id",
activity_id = "action_type",
activity_instance_id = "action",
lifecycle_id = "status",
timestamp = "date",
resource_id = "originator")
# Print summary
summary(claims_log)
## Number of events: 5
## Number of cases: 2
## Number of traces: 1
## Number of distinct activities: 2
## Average trace length: 2.5
##
## Start eventlog: 2008-01-12
## End eventlog: 2008-04-14
## id action action_type
## Length:5 Length:5 Check Contract :2
## Class :character Class :character Pay Back Decision:3
## Mode :character Mode :character
##
##
##
## date originator status .order
## Min. :2008-01-12 Assistant 1:1 complete:1 Min. :1
## 1st Qu.:2008-01-13 Assistant 6:1 start :4 1st Qu.:2
## Median :2008-03-22 Manager 2 :3 Median :3
## Mean :2008-02-28 Mean :3
## 3rd Qu.:2008-03-23 3rd Qu.:4
## Max. :2008-04-14 Max. :5
# Check activity labels
activity_labels(claims_log)
## [1] Check Contract Pay Back Decision
## Levels: Check Contract Pay Back Decision
# Once you have an eventlog, you can access its complete metadata using the function mapping or the functions case_id, activity_id etc., to inspect individual identifiers.
Chapter 2 - Analysis Techniques
Organizational analysis:
Structuredness:
Performance analysis:
Linking perspectives:
Example code includes:
# ---- Loading the sepsis event log shipped with eventdataR ----
data(sepsis, package="eventdataR")
# str() shows it is already an 'eventlog': the attr(*, "...") entries at the
# bottom record which columns play the case/activity/lifecycle/resource/
# timestamp roles.
str(sepsis)
## Classes 'eventlog', 'tbl_df', 'tbl' and 'data.frame': 15214 obs. of 34 variables:
## $ case_id : chr "A" "A" "A" "A" ...
## $ activity : Factor w/ 16 levels "Admission IC",..: 4 10 3 9 6 5 8 7 2 3 ...
## $ lifecycle : Factor w/ 1 level "complete": 1 1 1 1 1 1 1 1 1 1 ...
## $ resource : Factor w/ 26 levels "?","A","B","C",..: 2 3 3 3 4 2 2 2 5 3 ...
## $ timestamp : POSIXct, format: "2014-10-22 11:15:41" "2014-10-22 11:27:00" ...
## $ age : int 85 NA NA NA NA NA NA NA NA NA ...
## $ crp : num NA NA 210 NA NA NA NA NA NA 1090 ...
## $ diagnose : chr "A" NA NA NA ...
## $ diagnosticartastrup : chr "true" NA NA NA ...
## $ diagnosticblood : chr "true" NA NA NA ...
## $ diagnosticecg : chr "true" NA NA NA ...
## $ diagnosticic : chr "true" NA NA NA ...
## $ diagnosticlacticacid : chr "true" NA NA NA ...
## $ diagnosticliquor : chr "false" NA NA NA ...
## $ diagnosticother : chr "false" NA NA NA ...
## $ diagnosticsputum : chr "false" NA NA NA ...
## $ diagnosticurinaryculture : chr "true" NA NA NA ...
## $ diagnosticurinarysediment: chr "true" NA NA NA ...
## $ diagnosticxthorax : chr "true" NA NA NA ...
## $ disfuncorg : chr "true" NA NA NA ...
## $ hypotensie : chr "true" NA NA NA ...
## $ hypoxie : chr "false" NA NA NA ...
## $ infectionsuspected : chr "true" NA NA NA ...
## $ infusion : chr "true" NA NA NA ...
## $ lacticacid : chr NA NA NA "2.2" ...
## $ leucocytes : chr NA "9.6" NA NA ...
## $ oligurie : chr "false" NA NA NA ...
## $ sirscritheartrate : chr "true" NA NA NA ...
## $ sirscritleucos : chr "false" NA NA NA ...
## $ sirscrittachypnea : chr "true" NA NA NA ...
## $ sirscrittemperature : chr "true" NA NA NA ...
## $ sirscriteria2ormore : chr "true" NA NA NA ...
## $ activity_instance_id : chr "1" "2" "3" "4" ...
## $ .order : int 1 2 3 4 5 6 7 8 9 10 ...
## - attr(*, "case_id")= chr "case_id"
## - attr(*, "activity_id")= chr "activity"
## - attr(*, "activity_instance_id")= chr "activity_instance_id"
## - attr(*, "lifecycle_id")= chr "lifecycle"
## - attr(*, "resource_id")= chr "resource"
## - attr(*, "timestamp")= chr "timestamp"
# ---- Organizational analysis: who does the work? ----
# Print list of resources
# resource_frequency(level = "resource"): number of events executed per
# resource, absolute and relative.
resource_frequency(sepsis, level="resource")
## # A tibble: 26 x 3
## resource absolute relative
## <fct> <int> <dbl>
## 1 B 8111 0.533
## 2 A 3462 0.228
## 3 C 1053 0.0692
## 4 E 782 0.0514
## 5 ? 294 0.0193
## 6 F 216 0.0142
## 7 L 213 0.0140
## 8 O 186 0.0122
## 9 G 148 0.00973
## 10 I 126 0.00828
## # ... with 16 more rows
# Number of resources per activity
resource_frequency(sepsis, level = "activity")
## # A tibble: 16 x 11
## activity nr_of_resources min q1 mean median q3 max st_dev iqr
## <fct> <int> <int> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl>
## 1 Admissi~ 4 1 7 29.2 31 53.2 54 28.2 46.2
## 2 Admissi~ 20 1 17 59.1 40.5 68.2 216 62.7 51.2
## 3 CRP 1 3262 3262 3262 3262 3262 3262 NA 0
## 4 ER Regi~ 2 65 295 525 525 755 985 651. 460
## 5 ER Seps~ 2 65 295. 524. 524. 754. 984 650. 460.
## 6 ER Tria~ 1 1053 1053 1053 1053 1053 1053 NA 0
## 7 IV Anti~ 2 45 228. 412. 412. 595. 778 518. 366.
## 8 IV Liqu~ 2 38 207. 376. 376. 546. 715 479. 338.
## 9 LacticA~ 1 1466 1466 1466 1466 1466 1466 NA 0
## 10 Leucocy~ 1 3383 3383 3383 3383 3383 3383 NA 0
## 11 Release~ 1 671 671 671 671 671 671 NA 0
## 12 Release~ 1 56 56 56 56 56 56 NA 0
## 13 Release~ 1 25 25 25 25 25 25 NA 0
## 14 Release~ 1 24 24 24 24 24 24 NA 0
## 15 Release~ 1 6 6 6 6 6 6 NA 0
## 16 Return ~ 1 294 294 294 294 294 294 NA 0
## # ... with 1 more variable: total <int>
# Plot Number of executions per resource-activity (not working in R 3.5.3)
# resource_frequency(sepsis, level = "resource-activity") %>% plot
# Calculate resource involvement
# NOTE(review): judging by the output (C: 1050 cases, relative 1),
# `relative` here is the share of *cases* a resource appears in, whereas
# resource_frequency counts *events* -- confirm in the edeaR documentation.
resource_involvement(sepsis, level="resource")
## # A tibble: 26 x 3
## resource absolute relative
## <fct> <int> <dbl>
## 1 C 1050 1
## 2 B 1013 0.965
## 3 A 985 0.938
## 4 E 782 0.745
## 5 ? 294 0.28
## 6 F 200 0.190
## 7 O 179 0.170
## 8 G 147 0.14
## 9 I 118 0.112
## 10 M 82 0.0781
## # ... with 16 more rows
# Show graphically
sepsis %>% resource_involvement(level = "resource") %>% plot
# Compare with resource frequency
resource_frequency(sepsis, level="resource")
## # A tibble: 26 x 3
## resource absolute relative
## <fct> <int> <dbl>
## 1 B 8111 0.533
## 2 A 3462 0.228
## 3 C 1053 0.0692
## 4 E 782 0.0514
## 5 ? 294 0.0193
## 6 F 216 0.0142
## 7 L 213 0.0140
## 8 O 186 0.0122
## 9 G 148 0.00973
## 10 I 126 0.00828
## # ... with 16 more rows
# ---- Structuredness: repeated execution of activities ----
# Min, max and average number of repetitions
# level = "log": one summary row over the whole event log.
sepsis %>% number_of_repetitions(level = "log")
## Using default type: all
## min q1 median mean q3 max st_dev iqr
## 0.000000 0.000000 2.000000 1.640000 3.000000 5.000000 1.280461 3.000000
## attr(,"type")
## [1] "all"
# Plot repetitions per activity
sepsis %>% number_of_repetitions(level = "activity") %>% plot
## Using default type: all
# Number of repetitions per resources
sepsis %>% number_of_repetitions(level = "resource")
## Using default type: all
## # Description: df[,3] [26 x 3]
## first_resource absolute relative
## <fct> <dbl> <dbl>
## 1 B 1536 0.189
## 2 G 67 0.453
## 3 F 16 0.0741
## 4 R 13 0.228
## 5 I 12 0.0952
## 6 Q 11 0.175
## 7 O 9 0.0484
## 8 J 8 0.308
## 9 T 8 0.229
## 10 K 7 0.389
## # ... with 16 more rows
# ---- Hand-transcribed event data for the "eating patterns" example ----
# Parallel vectors: position i of each vector describes one event
# (case id, activity, activity instance, lifecycle status, timestamp).
# Case ids; the sequence is repeated twice (once for 'start' events,
# once for 'complete' events -- see elci below).
eci <- c('21', '21', '21', '21', '21', '21', '21', '21', '21', '31', '31', '31', '31', '31', '31', '31', '31', '31', '31', '41', '41', '41', '41', '41', '41', '41', '51', '51', '51', '51', '51', '51', '51', '61', '61', '61', '61', '61', '61', '91', '91', '91', '91', '91', '91', '101', '101', '101', '101', '101', '101', '111', '111', '111', '111', '121', '121', '121', '121', '121', '121', '121', '121', '121', '131', '131', '131', '131', '131', '131', '131', '131', '161', '161', '171', '171', '171', '171', '181', '181', '181', '181', '181', '181', '201', '201', '201', '201', '201', '201', '201', '12', '12', '12', '12', '12', '22', '22', '22', '22', '22', '22', '32', '32', '32', '32', '32', '32', '42', '42', '42', '42', '52', '52', '52', '52', '52', '82', '82', '82', '82', '82', '92', '92', '92', '92', '92', '102', '102', '102', '102', '102', '112', '112', '122', '122', '21', '21', '21', '21', '21', '21', '21', '21', '21', '31', '31', '31', '31', '31', '31', '31', '31', '31', '31', '41', '41', '41', '41', '41', '41', '41', '51', '51', '51', '51', '51', '51', '51', '61', '61', '61', '61', '61', '61', '91', '91', '91', '91', '91', '91', '101', '101', '101', '101', '101', '101', '111', '111', '111', '111', '121', '121', '121', '121', '121', '121', '121', '121', '121', '131', '131', '131', '131', '131', '131', '131', '131', '161', '161', '171', '171', '171', '171', '181', '181', '181', '181', '181', '181', '201', '201', '201', '201', '201', '201', '201', '12', '12', '12', '12', '12', '22', '22', '22', '22', '22', '22', '32', '32', '32', '32', '32', '32', '42', '42', '42', '42', '52', '52', '52', '52', '52', '82', '82', '82', '82', '82', '92', '92', '92', '92', '92', '102', '102', '102', '102', '102', '112', '112', '122', '122')
# Activity labels for the 'start' events.
ea1 <- c('prepareBreakfast', 'eatingBreakfast', 'snack', 'prepareLunch', 'eatingLunch', 'snack', 'snack', 'prepareDinner', 'eatingDinner', 'prepareBreakfast', 'eatingBreakfast', 'snack', 'snack', 'prepareLunch', 'eatingLunch', 'snack', 'snack', 'prepareDinner', 'eatingDinner', 'prepareBreakfast', 'eatingBreakfast', 'prepareBreakfast', 'eatingBreakfast', 'prepareDinner', 'eatingDinner', 'snack', 'prepareBreakfast', 'eatingBreakfast', 'snack', 'prepareLunch', 'eatingLunch', 'prepareDinner', 'eatingDinner', 'prepareBreakfast', 'eatingBreakfast', 'snack', 'prepareLunch', 'eatingLunch', 'snack', 'prepareBreakfast', 'eatingBreakfast', 'snack', 'prepareDinner', 'eatingDinner', 'snack', 'prepareBreakfast', 'eatingBreakfast', 'snack', 'snack', 'prepareLunch', 'eatingLunch', 'prepareBreakfast', 'prepareLunch', 'eatingLunch', 'snack', 'snack', 'prepareBreakfast', 'eatingBreakfast', 'snack', 'snack', 'prepareLunch', 'eatingLunch', 'snack', 'snack', 'snack', 'eatingBreakfast', 'prepareBreakfast', 'snack', 'prepareLunch', 'eatingLunch', 'snack', 'snack', 'eatingBreakfast', 'prepareBreakfast', 'prepareBreakfast', 'eatingBreakfast', 'snack', 'snack', 'prepareBreakfast', 'eatingBreakfast', 'snack', 'snack', 'snack', 'eatingDinner', 'prepareBreakfast', 'eatingBreakfast', 'prepareLunch', 'eatingLunch', 'snack', 'prepareDinner', 'eatingDinner', 'prepareBreakfast', 'eatingBreakfast', 'prepareLunch', 'eatingLunch', 'snack', 'prepareBreakfast', 'eatingBreakfast', 'snack', 'prepareLunch', 'eatingLunch', 'snack', 'prepareBreakfast', 'eatingBreakfast', 'snack', 'prepareLunch', 'eatingLunch', 'snack', 'eatingLunch', 'prepareBreakfast', 'eatingBreakfast', 'prepareLunch', 'prepareBreakfast', 'eatingBreakfast', 'prepareLunch', 'eatingLunch', 'snack', 'prepareBreakfast', 'eatingBreakfast', 'prepareLunch', 'eatingLunch', 'snack', 'prepareBreakfast', 'eatingBreakfast', 'prepareLunch', 'eatingLunch', 'snack', 'prepareBreakfast', 'eatingBreakfast', 'prepareLunch', 'eatingLunch', 'snack', 'prepareBreakfast', 'eatingBreakfast', 'prepareBreakfast')
# Activity labels for the 'complete' events.
# NOTE(review): ea2 looks like ea1 shifted by one element; start/complete
# events of the same activity instance would normally carry identical
# activity labels -- verify against the original data source.
ea2 <- c('eatingBreakfast', 'prepareBreakfast', 'eatingBreakfast', 'snack', 'prepareLunch', 'eatingLunch', 'snack', 'snack', 'prepareDinner', 'eatingDinner', 'prepareBreakfast', 'eatingBreakfast', 'snack', 'snack', 'prepareLunch', 'eatingLunch', 'snack', 'snack', 'prepareDinner', 'eatingDinner', 'prepareBreakfast', 'eatingBreakfast', 'prepareBreakfast', 'eatingBreakfast', 'prepareDinner', 'eatingDinner', 'snack', 'prepareBreakfast', 'eatingBreakfast', 'snack', 'prepareLunch', 'eatingLunch', 'prepareDinner', 'eatingDinner', 'prepareBreakfast', 'eatingBreakfast', 'snack', 'prepareLunch', 'eatingLunch', 'snack', 'prepareBreakfast', 'eatingBreakfast', 'snack', 'prepareDinner', 'eatingDinner', 'snack', 'prepareBreakfast', 'eatingBreakfast', 'snack', 'snack', 'prepareLunch', 'eatingLunch', 'prepareBreakfast', 'prepareLunch', 'eatingLunch', 'snack', 'snack', 'prepareBreakfast', 'eatingBreakfast', 'snack', 'snack', 'prepareLunch', 'eatingLunch', 'snack', 'snack', 'snack', 'eatingBreakfast', 'prepareBreakfast', 'snack', 'prepareLunch', 'eatingLunch', 'snack', 'snack', 'eatingBreakfast', 'prepareBreakfast', 'prepareBreakfast', 'eatingBreakfast', 'snack', 'snack', 'prepareBreakfast', 'eatingBreakfast', 'snack', 'snack', 'snack', 'eatingDinner', 'prepareBreakfast', 'eatingBreakfast', 'prepareLunch', 'eatingLunch', 'snack', 'prepareDinner', 'eatingDinner', 'prepareBreakfast', 'eatingBreakfast', 'prepareLunch', 'eatingLunch', 'snack', 'prepareBreakfast', 'eatingBreakfast', 'snack', 'prepareLunch', 'eatingLunch', 'snack', 'prepareBreakfast', 'eatingBreakfast', 'snack', 'prepareLunch', 'eatingLunch', 'snack', 'eatingLunch', 'prepareBreakfast', 'eatingBreakfast', 'prepareLunch', 'prepareBreakfast', 'eatingBreakfast', 'prepareLunch', 'eatingLunch', 'snack', 'prepareBreakfast', 'eatingBreakfast', 'prepareLunch', 'eatingLunch', 'snack', 'prepareBreakfast', 'eatingBreakfast', 'prepareLunch', 'eatingLunch', 'snack', 'prepareBreakfast', 'eatingBreakfast', 'prepareLunch', 'eatingLunch', 'snack', 'prepareBreakfast', 'eatingBreakfast', 'prepareBreakfast', 'eatingBreakfast')
# Activity-instance ids (the same sequence repeated for start and complete).
eaii <- c('9', '10', '19', '23', '24', '26', '36', '40', '41', '51', '52', '58', '60', '62', '63', '67', '69', '72', '73', '86', '87', '89', '90', '104', '105', '107', '119', '120', '128', '132', '133', '138', '139', '149', '150', '156', '159', '160', '164', '174', '175', '192', '194', '195', '198', '205', '206', '208', '211', '213', '214', '229', '236', '237', '239', '245', '251', '252', '253', '255', '259', '260', '262', '264', '271', '276', '281', '287', '292', '293', '297', '299', '310', '312', '331', '332', '336', '347', '363', '364', '374', '376', '387', '389', '434', '435', '447', '448', '450', '453', '454', '462', '463', '471', '472', '475', '483', '484', '487', '491', '492', '496', '508', '509', '512', '517', '518', '522', '536', '540', '541', '543', '562', '563', '565', '566', '572', '584', '585', '589', '590', '598', '615', '616', '618', '619', '627', '639', '640', '642', '643', '653', '665', '666', '682', '683', '9', '10', '19', '23', '24', '26', '36', '40', '41', '51', '52', '58', '60', '62', '63', '67', '69', '72', '73', '86', '87', '89', '90', '104', '105', '107', '119', '120', '128', '132', '133', '138', '139', '149', '150', '156', '159', '160', '164', '174', '175', '192', '194', '195', '198', '205', '206', '208', '211', '213', '214', '229', '236', '237', '239', '245', '251', '252', '253', '255', '259', '260', '262', '264', '271', '276', '281', '287', '292', '293', '297', '299', '310', '312', '331', '332', '336', '347', '363', '364', '374', '376', '387', '389', '434', '435', '447', '448', '450', '453', '454', '462', '463', '471', '472', '475', '483', '484', '487', '491', '492', '496', '508', '509', '512', '517', '518', '522', '536', '540', '541', '543', '562', '563', '565', '566', '572', '584', '585', '589', '590', '598', '615', '616', '618', '619', '627', '639', '640', '642', '643', '653', '665', '666', '682', '683')
# Lifecycle status per event: a run of 'start' followed by a run of 'complete'.
elci <- c('start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete')
# Timestamps, split over three vectors purely for transcription convenience.
ets1 <- c('2012-11-12 09:42:02', '2012-11-12 09:52:33', '2012-11-12 11:05:44', '2012-11-12 13:45:49', '2012-11-12 13:48:49', '2012-11-12 15:23:00', '2012-11-12 18:47:29', '2012-11-12 22:35:21', '2012-11-12 22:35:21', '2012-11-13 08:56:37', '2012-11-13 09:04:54', '2012-11-13 10:14:04', '2012-11-13 13:47:45', '2012-11-13 14:08:24', '2012-11-13 14:19:01', '2012-11-13 17:34:23', '2012-11-13 18:51:51', '2012-11-13 23:05:07', '2012-11-13 23:17:07', '2012-11-14 09:06:08', '2012-11-14 09:17:48', '2012-11-14 10:38:16', '2012-11-14 10:44:16', '2012-11-14 21:30:09', '2012-11-14 21:37:09', '2012-11-14 22:14:23', '2012-11-15 09:37:15', '2012-11-15 09:47:12', '2012-11-15 10:11:08', '2012-11-15 14:35:27', '2012-11-15 14:41:27', '2012-11-15 22:07:26', '2012-11-15 22:26:02', '2012-11-16 10:39:14', '2012-11-16 10:52:56', '2012-11-16 12:09:10', '2012-11-16 14:13:00', '2012-11-16 14:19:00', '2012-11-16 18:11:36', '2012-11-19 10:13:23', '2012-11-19 10:25:00', '2012-11-19 15:55:22', '2012-11-19 21:47:27', '2012-11-19 21:59:27', '2012-11-19 22:31:06', '2012-11-20 10:20:00', '2012-11-20 10:21:02', '2012-11-20 11:00:16', '2012-11-20 13:03:28', '2012-11-20 14:25:11', '2012-11-20 14:41:22', '2012-11-21 10:01:00', '2012-11-21 15:02:08', '2012-11-21 15:15:08', '2012-11-21 17:50:29', '2012-11-22 01:40:42', '2012-11-22 10:19:15', '2012-11-22 10:26:15', '2012-11-22 11:02:27', '2012-11-22 11:56:06', '2012-11-22 15:05:51', '2012-11-22 15:12:55', '2012-11-22 16:43:08', '2012-11-22 18:15:32', '2012-11-23 00:36:00', '2012-11-23 01:03:00', '2012-11-23 09:49:00', '2012-11-23 12:53:06', '2012-11-23 14:01:08', '2012-11-23 14:23:08', '2012-11-23 16:57:24', '2012-11-23 17:58:00', '2012-11-26 09:06:12', '2012-11-26 09:57:12', '2012-11-27 10:20:26', '2012-11-27 10:30:50')
# FIX(review): in the transcript one timestamp string in each of ets2 and
# ets3 was broken across a line wrap ('2012-11-15 09:44:06' and
# '2011-12-05 15:42:00'), embedding a newline inside the literal; they are
# rejoined here so that as.POSIXct() parses every element.
ets2 <- c('2012-11-27 11:54:15', '2012-11-27 19:46:15', '2012-11-28 09:27:15', '2012-11-28 09:34:15', '2012-11-28 12:28:02', '2012-11-28 13:16:33', '2012-11-28 19:30:08', '2012-11-28 22:15:02', '2012-11-30 10:43:19', '2012-11-30 10:46:19', '2012-11-30 14:51:36', '2012-11-30 15:08:36', '2012-11-30 17:30:40', '2012-11-30 22:12:05', '2012-11-30 22:16:07', '2011-11-28 10:38:00', '2011-11-28 10:43:00', '2011-11-28 14:31:06', '2011-11-28 14:42:00', '2011-11-28 20:20:55', '2011-11-29 12:09:09', '2011-11-29 12:11:01', '2011-11-29 13:25:29', '2011-11-29 15:15:14', '2011-11-29 15:23:00', '2011-11-29 16:32:20', '2011-11-30 10:23:46', '2011-11-30 10:28:46', '2011-11-30 13:05:27', '2011-11-30 14:39:42', '2011-11-30 14:56:00', '2011-11-30 16:41:05', '2011-11-30 14:37:00', '2011-12-01 11:17:05', '2011-12-01 11:20:05', '2011-12-01 14:29:37', '2011-12-02 12:29:08', '2011-12-02 12:32:08', '2011-12-02 14:47:18', '2011-12-02 14:51:00', '2011-12-02 19:40:44', '2011-12-05 12:15:45', '2011-12-05 12:18:05', '2011-12-05 15:00:55', '2011-12-05 15:14:00', '2011-12-05 19:24:11', '2011-12-06 11:30:19', '2011-12-06 11:33:02', '2011-12-06 14:41:16', '2011-12-06 14:56:00', '2011-12-06 19:22:50', '2011-12-07 11:12:17', '2011-12-07 11:17:22', '2011-12-07 14:04:32', '2011-12-07 14:14:00', '2011-12-07 19:23:55', '2011-12-08 11:25:12', '2011-12-08 11:29:01', '2011-12-09 11:00:13', '2011-12-09 11:03:33', '2012-11-12 09:50:02', '2012-11-12 09:55:29', '2012-11-12 12:39:42', '2012-11-12 14:48:14', '2012-11-12 14:53:14', '2012-11-12 15:31:53', '2012-11-12 19:00:56', '2012-11-12 22:37:55', '2012-11-12 22:40:55', '2012-11-13 09:00:26', '2012-11-13 09:10:12', '2012-11-13 10:51:55', '2012-11-13 14:03:31', '2012-11-13 14:18:36', '2012-11-13 14:42:36', '2012-11-13 17:36:34', '2012-11-13 19:45:03', '2012-11-13 23:15:33', '2012-11-13 23:37:33', '2012-11-14 09:09:41', '2012-11-14 09:21:43', '2012-11-14 11:43:23', '2012-11-14 11:06:23', '2012-11-14 21:35:17', '2012-11-14 21:47:18', '2012-11-14 22:17:47', '2012-11-15 09:44:06', '2012-11-15 09:48:08', '2012-11-15 10:23:49', '2012-11-15 15:40:32', '2012-11-15 15:46:32', '2012-11-15 22:22:44', '2012-11-15 22:31:00', '2012-11-16 10:42:13')
ets3 <- c('2012-11-16 10:52:58', '2012-11-16 12:09:57', '2012-11-16 14:58:55', '2012-11-16 14:55:55', '2012-11-16 18:14:49', '2012-11-19 10:17:12', '2012-11-19 10:33:59', '2012-11-19 16:07:49', '2012-11-19 21:59:01', '2012-11-19 22:24:58', '2012-11-19 22:31:59', '2012-11-20 10:21:02', '2012-11-20 10:37:51', '2012-11-20 11:14:44', '2012-11-20 13:28:35', '2012-11-20 14:40:16', '2012-11-20 15:10:16', '2012-11-21 10:06:50', '2012-11-21 15:14:47', '2012-11-21 15:30:55', '2012-11-21 17:55:48', '2012-11-22 01:45:42', '2012-11-22 10:25:45', '2012-11-22 10:59:45', '2012-11-22 11:10:30', '2012-11-22 12:09:07', '2012-11-22 15:12:19', '2012-11-22 15:26:18', '2012-11-22 16:51:54', '2012-11-22 18:17:25', '2012-11-23 00:41:13', '2012-11-23 10:28:57', '2012-11-23 10:01:57', '2012-11-23 12:57:33', '2012-11-23 14:20:47', '2012-11-23 14:38:47', '2012-11-23 16:57:43', '2012-11-23 18:06:38', '2012-11-26 10:37:28', '2012-11-26 10:05:28', '2012-11-27 10:30:43', '2012-11-27 10:44:43', '2012-11-27 11:54:59', '2012-11-27 19:46:56', '2012-11-28 09:33:52', '2012-11-28 09:44:52', '2012-11-28 12:57:42', '2012-11-28 13:38:45', '2012-11-28 19:45:20', '2012-11-28 22:18:43', '2012-11-30 11:45:40', '2012-11-30 11:51:40', '2012-11-30 15:05:54', '2012-11-30 15:20:00', '2012-11-30 17:42:59', '2012-11-30 22:15:48', '2012-11-30 22:39:48', '2011-11-28 10:42:55', '2011-11-28 10:49:00', '2011-11-28 14:41:54', '2011-11-28 15:04:00', '2011-11-28 20:20:59', '2011-11-29 12:10:37', '2011-11-29 12:19:00', '2011-11-29 13:25:32', '2011-11-29 15:22:57', '2011-11-29 15:49:00', '2011-11-29 16:32:23', '2011-11-30 10:27:58', '2011-11-30 10:38:58', '2011-11-30 13:05:31', '2011-11-30 14:55:24', '2011-11-30 15:11:00', '2011-11-30 16:41:09', '2011-11-30 15:08:00', '2011-12-01 11:19:43', '2011-12-01 11:29:43', '2011-12-01 14:36:38', '2011-12-02 12:31:10', '2011-12-02 12:37:10', '2011-12-02 14:50:19', '2011-12-02 15:24:00', '2011-12-02 19:40:50', '2011-12-05 12:17:58', '2011-12-05 12:26:02', '2011-12-05 15:13:55', '2011-12-05 15:42:00', '2011-12-05 19:24:16', '2011-12-06 11:32:49', '2011-12-06 11:38:51', '2011-12-06 14:55:18', '2011-12-06 15:18:18', '2011-12-06 19:22:55', '2011-12-07 11:17:14', '2011-12-07 11:22:35', '2011-12-07 14:13:34', '2011-12-07 14:41:00', '2011-12-07 20:38:18', '2011-12-08 11:28:24', '2011-12-08 11:35:55', '2011-12-09 11:03:09', '2011-12-09 11:09:08')
# Combine the three timestamp chunks into one vector, positionally aligned
# with the other event vectors.
etsF <- c(ets1, ets2, ets3)
# Assemble one row per event; the single-element factor "UNDEFINED" is
# recycled to the full length since no resource data was recorded.
eatData <- tibble(case_id=eci,
activity=factor(c(ea1, ea2)),
activity_instance_id=eaii,
lifecycle_id=factor(elci),
resource=factor("UNDEFINED"),
timestamp=as.POSIXct(etsF)
)
# Declare the process-mining roles of each column to obtain a bupaR eventlog.
eat_patterns <- eventlog(eatData,
case_id = "case_id",
activity_id = "activity",
activity_instance_id = "activity_instance_id",
lifecycle_id = "lifecycle_id",
timestamp = "timestamp",
resource_id = "resource")
# ---- Performance analysis ----
# Create performance map
# performance(): annotate process-map edges/nodes with a duration statistic
# (here the median, in hours) instead of frequencies.
eat_patterns %>% process_map(type = performance(FUN = median, units = "hours"))
# Inspect variation in activity durations graphically
eat_patterns %>% processing_time(level = "activity") %>% plot()
# Draw dotted chart
eat_patterns %>% dotted_chart(x = "relative_day", sort = "start_day", units = "secs")
## Joining, by = "case_id"
# NOTE(review): the calls below are kept commented out because the
# daily_activities and vacancies datasets are not loaded in this document.
# Time per activity
# daily_activities %>% processing_time(level = "activity") %>% plot
# Average duration of recordings
# daily_activities %>% throughput_time(level="log", units = "hours")
# Missing activities
# daily_activities %>% idle_time(level="log", units = "hours")
# Distribution throughput time
# vacancies %>% throughput_time(units="days")
# Distribution throughput time per department
# vacancies %>% group_by(vacancy_department) %>% throughput_time(units="days") %>% plot()
# Repetitions of activities
# vacancies %>% number_of_repetitions(level = "activity") %>% arrange(-relative)
Chapter 3 - Event Data Processing
Filtering cases:
Filtering events - trim, frequency, label, general attribute:
Aggregating events - Is-A and Part-of:
Enriching events - mutation (adding calculated variables):
Example code includes:
# Select top 20% of cases according to trace frequency
happy_path <- filter_trace_frequency(vacancies, percentage = 0.2)
# Visualize using process map
# FIX(review): the transcript read `type=requency(...)`, which is not a
# function; the processmapR helper is frequency().
happy_path %>% process_map(type = frequency(value = "absolute_case"))
# Compute throughput time
happy_path %>% throughput_time(units="days")
# Find no_declines
# reverse = TRUE appears to invert the filter, i.e. keep cases WITHOUT the
# activity "Decline Candidate" -- confirm in the edeaR documentation.
no_declines <- filter_activity_presence(vacancies, activities = "Decline Candidate", reverse=TRUE)
# What is the average number of
# method = "all": keep cases in which ALL listed activities are present.
first_hit <- filter_activity_presence(vacancies, activities = c("Send Offer", "Offer Accepted"), method="all")
# Create a performance map
first_hit %>% process_map(type=performance())
# Compute throughput time
first_hit %>% throughput_time()
# Create not_refused
# filter_method = "none": keep cases where the directly-follows relation
# never occurs between antecedent and consequent.
not_refused <- vacancies %>% filter_precedence(antecedents = "Receive Response", consequents = "Review Non Acceptance", precedence_type = "directly_follows", filter_method = "none")
# Select longest_cases
# interval = c(300, NA): throughput time of at least 300 (no upper bound).
worst_cases <- not_refused %>% filter_throughput_time(interval=c(300, NA))
# Show the different traces
worst_cases %>% trace_explorer(coverage=1)
# Select activities
disapprovals <- vacancies %>% filter_activity(activities=c("Construct Offer", "Disapprove Offer", "Revise Offer","Disapprove Revision", "Restart Procedure"))
# Explore traces
disapprovals %>% trace_explorer(coverage=0.8)
# Performance map
disapprovals %>% process_map(type = performance(FUN = sum, units = "weeks"))
# Select cases
# Plain dplyr::filter on case attributes works directly on the eventlog.
high_paid <- vacancies %>% filter(vacancy_department=="R&D", vacancy_salary_range==">100000")
# Most active resources
high_paid %>% resource_frequency(level="resource")
# Create a dotted chart
high_paid %>% dotted_chart(x="absolute", sort="start")
# Filtered dotted chart
library(lubridate)
# filter_method = "trim": cut events outside the interval rather than
# dropping whole cases.
high_paid %>% filter_time_period(interval = ymd(c("20180321","20180620")), filter_method = "trim") %>% dotted_chart(x="absolute", sort="start")
# Count activities and instances
n_activities(vacancies)
n_activity_instances(vacancies)
# Combine activities
# act_unite: merge several activity labels into one ("Is-A" relabelling);
# the LHS name becomes the new label for the RHS labels.
united_vacancies <- vacancies %>%
act_unite("Disapprove Contract Offer" = c("Disapprove Offer","Disapprove Revision"),
"Approve Contract Offer" = c("Approve Offer","Approve Revision"),
"Construct Contract Offer" = c("Construct Offer","Revise Offer")
)
# Count activities and instances
n_activities(united_vacancies)
n_activity_instances(united_vacancies)
# Aggregate sub processes
# act_collapse: fold groups of activities into a single sub-process
# activity ("Part-of" aggregation).
aggregated_vacancies <- act_collapse(united_vacancies,
"Interviews" = c("First Interview","Second Interview","Third Interview"),
"Prepare Recruitment" = c("Publish Position","File Applications","Check References"),
"Create Offer" = c("Construct Contract Offer", "Disapprove Contract Offer", "Approve Contract Offer")
)
# Calculated number of activities and activity instances
n_activities(aggregated_vacancies)
n_activity_instances(aggregated_vacancies)
# Create performance map
aggregated_vacancies %>% process_map(type=performance())
# Add total_cost
# group_by_case() + mutate: every event of a case gets that case's summed
# activity_cost.
vacancies_cost <- vacancies %>%
group_by_case() %>%
mutate(total_cost = sum(activity_cost, na.rm = TRUE))
# Add cost_impact
# FIX(review): the transcript had a dangling pipe here -- the mutate() that
# derives cost_impact from total_cost was lost. Reconstructed below; the
# downstream code requires the levels "High", "Medium" and "Low". The
# threshold values are placeholders -- confirm against the original course
# material.
vacancies_impact <- vacancies_cost %>%
mutate(cost_impact = case_when(total_cost >= 50000 ~ "High",
total_cost >= 25000 ~ "Medium",
TRUE ~ "Low"))
# Compute throughput time per impact
vacancies_impact %>% group_by(cost_impact) %>% throughput_time(units = "weeks") %>% plot()
# Create cost_profile
# Cross cost_impact with urgency to label whether spending matches urgency.
vacancies_profile <- vacancies_impact %>%
mutate(cost_profile = case_when(cost_impact == "High" & urgency < 7 ~ "Disproportionate",
cost_impact == "Medium" & urgency < 5 ~ "Excessive",
cost_impact == "Low" & urgency > 6 ~ "Lacking",
TRUE ~ "Appropriate"))
# Compare number of cases
vacancies_profile %>%
group_by(cost_profile) %>%
n_cases()
# Explore lacking traces
vacancies_profile %>%
filter(cost_profile == "Lacking") %>%
process_map()
Chapter 4 - Case Study
Preparing the event data - example includes data from Sales, Purchasing, Manufacturing, Packaging & Delivery, Accounting:
Getting to know the process:
Roles and rules:
Fast production, fast delivery:
Course recap:
Example code includes:
# ---- Case study: reshape wide quotation data into an event table ----
# One row per quotation; each activity has paired <activity>_at (timestamp)
# and <activity>_by (resource) columns.
quotations <- readRDS("./RInputFiles/otc_quotations.RDS")
# Inspect quotations
str(quotations)
## Classes 'tbl_df', 'tbl' and 'data.frame': 1833 obs. of 17 variables:
## $ quotation_id : chr "quo-1003" "quo-1004" "quo-1006" "quo-1008" ...
## $ cancelled_at : chr "2017-05-22 13:28:04" NA NA NA ...
## $ cancelled_by : Factor w/ 20 levels "Amy","Andrea",..: 10 NA NA NA 8 NA NA NA NA NA ...
## $ manufactContacted_at : chr "2017-04-22 17:58:11" "2017-06-18 13:47:50" "2017-10-28 13:55:51" NA ...
## $ manufactContacted_by : Factor w/ 20 levels "Amy","Andrea",..: 11 11 11 NA NA NA 11 14 NA NA ...
## $ received_at : chr "2017-04-16 20:34:12" "2017-06-09 11:19:31" "2017-10-14 18:55:47" "2017-09-08 13:29:05" ...
## $ received_by : Factor w/ 20 levels "Amy","Andrea",..: 2 8 8 8 8 8 10 8 2 2 ...
## $ reminded_at : chr "2017-05-14 19:06:41" NA NA NA ...
## $ reminded_by : Factor w/ 20 levels "Amy","Andrea",..: 8 NA NA NA 8 NA 8 8 NA NA ...
## $ send_at : chr "2017-05-08 14:20:30" "2017-07-02 18:50:58" "2017-11-09 11:27:11" NA ...
## $ send_by : Factor w/ 20 levels "Amy","Andrea",..: 10 2 2 NA 2 NA 2 2 NA 2 ...
## $ supplierContacted_at : chr "2017-04-29 13:43:18" "2017-06-20 12:19:31" "2017-10-26 18:06:29" NA ...
## $ supplierContacted_by : Factor w/ 20 levels "Amy","Andrea",..: 14 11 11 NA 11 NA 11 14 NA 14 ...
## $ supplierOfferReceived_at: chr "2017-05-03 19:09:21" "2017-06-23 19:33:10" "2017-10-30 10:36:44" NA ...
## $ supplierOfferReceived_by: Factor w/ 20 levels "Amy","Andrea",..: 14 11 14 NA 14 NA 14 14 NA 14 ...
## $ warehouseContacted_at : chr "2017-04-24 19:36:10" "2017-06-15 19:30:07" "2017-10-22 17:57:26" NA ...
## $ warehouseContacted_by : Factor w/ 20 levels "Amy","Andrea",..: 11 11 11 NA 14 NA 11 14 NA 14 ...
# Create offer_history
# gather to long format, then split "activity_info" column names on "_".
# NOTE(review): gather()/spread() are superseded; in current tidyr this
# would be pivot_longer()/pivot_wider(). Kept as in the original course.
offer_history <- quotations %>%
gather(key, value, -quotation_id) %>%
separate(key, into = c("activity", "info"))
## Warning: attributes are not identical across measure variables;
## they will be dropped
# Recode the key variable
# Map the "_at"/"_by" suffixes to meaningful column names.
offer_history <- offer_history %>%
mutate(info = fct_recode(info, "timestamp" = 'at', "resource" = 'by'))
# Spread the info variable
# One row per quotation/activity, with timestamp and resource columns.
offer_history <- offer_history %>%
spread(info, value)
# ---- Validation data: one row per quotation with start/complete times ----
validations <- readRDS("./RInputFiles/otc_validations.RDS")
# Inspect validations
str(validations)
## Classes 'tbl_df', 'tbl' and 'data.frame': 1833 obs. of 4 variables:
## $ quotation_id: chr "quo-1003" "quo-1004" "quo-1006" "quo-1008" ...
## $ resource : chr "Jonathan" "Andrea" "Katherine" "Andrea" ...
## $ started : chr "2017-04-17 14:59:08" "2017-06-11 13:10:45" "2017-10-16 15:59:18" "2017-09-09 17:58:39" ...
## $ completed : chr "2017-04-19 18:32:57" "2017-06-13 12:18:57" "2017-10-18 16:21:56" "2017-09-12 20:58:14" ...
# Create validate_history
# `action` serves as the activity-instance id, unique per quotation.
validate_history <- validations %>%
mutate(
activity = "Validate",
action = paste(quotation_id, "validate", sep = "-"))
# Gather the timestamp columns
# Turn the started/completed columns into lifecycle/timestamp event rows.
validate_history <- validate_history %>%
gather(lifecycle, timestamp, started, completed)
# Recode the lifecycle column of validate_history
# Rename levels to the standard bupaR lifecycle values.
validate_history <- validate_history %>%
mutate(lifecycle = fct_recode(lifecycle,
"start" = "started",
"complete" = "completed"))
# Add lifecycle and action column to offer_history
# Offer events have no duration data, so each is a single "complete" event
# with a unique activity-instance id built from quotation_id + row number.
offer_history <- offer_history %>%
mutate(
lifecycle = "complete",
action = paste(quotation_id, 1:n(), sep = "-"))
# Create sales_history
# Stack validation events and offer events into one event table.
sales_history <- bind_rows(validate_history, offer_history)
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
# Re-load sales_history from disk (overwrites the version assembled above)
# and load the corresponding order-fulfilment event table
sales_history <- readRDS("./RInputFiles/otc_sales_history.RDS")
order_history <- readRDS("./RInputFiles/otc_order_history.RDS")
# sales_quotations <- readRDS("./RInputFiles/otc_sales_quotation.RDS")
str(sales_history)
## Classes 'tbl_df', 'tbl' and 'data.frame': 14695 obs. of 7 variables:
## $ quotation_id : chr "quo-1003" "quo-1004" "quo-1006" "quo-1008" ...
## $ resource : chr "Jonathan" "Andrea" "Katherine" "Andrea" ...
## $ activity : chr "Validate" "Validate" "Validate" "Validate" ...
## $ action : chr "quo-1003-validate" "quo-1004-validate" "quo-1006-validate" "quo-1008-validate" ...
## $ lifecycle : chr "start" "start" "start" "start" ...
## $ timestamp : chr "2017-04-17 14:59:08" "2017-06-11 13:10:45" "2017-10-16 15:59:18" "2017-09-09 17:58:39" ...
## $ sales_order_id: chr NA "order-17-56548" "order-17-56550" NA ...
str(order_history)
## Classes 'tbl_df', 'tbl' and 'data.frame': 60804 obs. of 8 variables:
## $ sales_order_id: chr "order-17-56542" "order-17-56542" "order-17-56543" "order-17-56543" ...
## $ action : chr "order-17-56542-0000001" "order-17-56542-0000002" "order-17-56543-0000003" "order-17-56543-0000004" ...
## $ activity : Factor w/ 37 levels "Assemble Order",..: 24 35 24 35 24 35 24 35 24 35 ...
## $ resource : Factor w/ 20 levels "Amy","Andrea",..: 10 8 2 8 2 8 10 8 2 8 ...
## $ status : Factor w/ 2 levels "complete","start": 2 2 2 2 2 2 2 2 2 2 ...
## $ time : POSIXct, format: "2017-10-17 12:37:22" "2017-10-19 15:30:40" ...
## $ activity_cost : num NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ...
## $ quotation_id : chr NA NA NA NA ...
# str(sales_quotations)
# Align order_history columns with sales_history: rename time/status to the
# bupaR-standard timestamp/lifecycle names, drop the (all-NaN) activity_cost
# column, and convert factor columns to character so the later bind_rows()
# does not need to coerce.
order_history <- order_history %>%
  rename(timestamp=time, lifecycle=status) %>%
  select(-activity_cost) %>%
  mutate(activity=as.character(activity),
         # BUG FIX: was as.character(activity) — a copy-paste error that
         # overwrote every resource with the activity name (str() above shows
         # resource holds person names such as "Amy", "Andrea")
         resource=as.character(resource),
         lifecycle=as.character(lifecycle)
  )
# Parse the character timestamps into POSIXct to match order_history's time column
sales_history <- sales_history %>%
mutate(timestamp=lubridate::as_datetime(timestamp))
# sales_history <- sales_history %>% left_join(sales_quotations)
# Stack the sales and order events into one order-to-cash (otc) event table
otc <- bind_rows(sales_history, order_history)
# Create the eventlog object; a case follows a quotation through its sales
# order, so the case id combines both identifiers
otc <- otc %>%
mutate(case_id = paste(quotation_id, sales_order_id, sep = "-")) %>%
eventlog(
case_id = "case_id",
activity_id = "activity",
activity_instance_id = "action",
timestamp = "timestamp",
resource_id = "resource",
lifecycle_id = "lifecycle"
)
# Create trace coverage graph (share of cases covered per distinct trace)
trace_coverage(otc, level="trace") %>% plot()
# Explore the traces that together cover at least 25% of cases
otc %>%
trace_explorer(coverage = 0.25)
# Collapse the four delivery-related activities into a single "Delivery"
# activity to simplify the process map
otc_high_level <- act_collapse(otc, "Delivery" = c(
"Handover To Deliverer",
"Order Delivered",
"Present For Collection",
"Order Fetched")
)
# Draw a process map
process_map(otc_high_level)
# Redraw the trace coverage graph on the collapsed log
otc_high_level %>% trace_coverage(level="trace") %>% plot()
# Compute activity wise processing time
otc_high_level %>% processing_time(level="activity", units="days")
## # A tibble: 34 x 11
## activity min q1 mean median q3 max st_dev iqr total
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Packagi~ 0 0 0 0 0 0 0 0 0
## 2 Prepare~ 0 0 0 0 0 0 0 0 0
## 3 Produce~ 0 0 0 0 0 0 0 0 0
## 4 Quality~ 0 0 0 0 0 0 0 0 0
## 5 Assembl~ 0 0 0 0 0 0 0 0 0
## 6 Delivery 0.583 1.99 5.11 3.11 8.06 17.0 3.86 6.07 15452.
## 7 Order M~ 0 0 0 0 0 0 0 0 0
## 8 Receive~ 0 0 0 0 0 0 0 0 0
## 9 Receive~ 0 0 0 0 0 0 0 0 0
## 10 Schedul~ 0 0 0 0 0 0 0 0 0
## # ... with 24 more rows, and 1 more variable: relative_frequency <dbl>
# Plot a resource activity matrix of otc (does not work in R 3.5.3)
# otc %>% resource_frequency(level = "resource-activity") %>% plot()
# Create otc_selection: keep only the two customer-facing "send" activities
otc_selection <- otc %>% filter_activity(activities = c("Send Quotation","Send Invoice"))
# Explore all traces (coverage = 1)
otc %>% trace_explorer(coverage=1)
# Draw a resource map (handover of work between resources)
otc_selection %>% resource_map()
# Create otc_returned: cases in which goods were returned
otc_returned <- otc %>% filter_activity_presence("Return Goods")
# Compute percentage of returned orders
n_cases(otc_returned)/n_cases(otc)
## [1] 0.2130923
# Trim cases to start at the return and visualize what happens afterwards
otc_returned %>% filter_trim(start_activities="Return Goods") %>% process_map()
# Time from order to delivery
# otc %>% filter_trim(start_activities="Receive Sales Order", end_activities="Order Delivered") %>%
# processing_time(units="days")
# Plot processing time by type
# otc %>%
# group_by(type) %>%
# throughput_time() %>%
# plot()
Chapter 1 - Hubs of the Network
Network science - includes social networks, neural networks, etc.:
Visualizing networks:
Centrality measures:
Example code includes:
# read the nodes file into the variable nodes
nodes <- readr::read_csv("./RInputFiles/nodes.csv")
nodes
# read the ties file into the variable ties
ties <- readr::read_csv("./RInputFiles/ties.csv")
ties
library(igraph)
library(ggraph)
# make an undirected network from the ties data frame and print it
g <- graph_from_data_frame(ties, directed = FALSE, vertices = nodes)
g
# explore the set of nodes
V(g)
# print the number of nodes
vcount(g)
# explore the set of ties
E(g)
# print the number of ties
ecount(g)
# give the name "Madrid network" to the network and print the network `name` attribute
g$name <- "Madrid network"
g$name
# add node attribute id (sequential integers) and print the node `id` attribute
V(g)$id <- 1:vcount(g)
V(g)$id
# print the tie `weight` attribute
E(g)$weight
# print the network and spot the attributes
g
# visualize the network with layout Kamada-Kawai; tie transparency (alpha)
# is proportional to weight
ggraph(g, layout = "with_kk") +
geom_edge_link(aes(alpha = weight)) +
geom_node_point()
# add an id label to nodes (repel = TRUE avoids overlapping labels)
ggraph(g, layout = "with_kk") +
geom_edge_link(aes(alpha = weight)) +
geom_node_point() +
geom_node_text(aes(label = id), repel=TRUE)
# visualize the network with circular layout. Set tie transparency proportional to its weight
ggraph(g, layout = "in_circle") +
geom_edge_link(aes(alpha = weight)) +
geom_node_point()
# visualize the network with grid layout. Set tie transparency proportional to its weight
ggraph(g, layout = "grid") +
geom_edge_link(aes(alpha = weight)) +
geom_node_point()
# compute the degrees of the nodes (number of ties per node)
dgr <- degree(g)
# add the degrees to the data frame object
nodes <- mutate(nodes, degree = dgr)
# add the degrees to the network object
V(g)$degree <- dgr
# arrange the terrorists in decreasing order of degree
arrange(nodes, -degree)
# compute node strengths (sum of the weights of each node's ties)
stg <- strength(g)
# add strength to the data frame object using mutate
nodes <- mutate(nodes, strength = stg)
# add the variable stg to the network object as strength
V(g)$strength <- stg
# arrange terrorists in decreasing order of strength and then in decreasing
# order of degree.
# BUG FIX: the original issued two independent arrange() calls (by degree
# only, then by strength only), so the stated two-key sort was never
# performed and the first result was simply discarded.
arrange(nodes, -strength, -degree)
Chapter 2 - Weakness and strength
Tie betweenness:
Visualizing centrality measures:
The strength of weak ties:
Example code includes:
# save the inverse of tie weights as dist_weight: igraph treats weights as
# distances in betweenness, so strong ties must count as short distances
dist_weight <- 1 / E(g)$weight
# compute weighted tie betweenness
btw <- edge_betweenness(g, weights = dist_weight)
# mutate the data frame ties adding a variable betweenness using btw
ties <- mutate(ties, betweenness=btw)
# add the tie attribute betweenness to the network
E(g)$betweenness <- btw
# join ties with nodes to attach the endpoint names of every tie
ties_joined <- ties %>%
left_join(nodes, c("from" = "id")) %>%
left_join(nodes, c("to" = "id"))
# select only relevant variables and save to ties_selected
ties_selected <- ties_joined %>%
select(from, to, name_from = name.x, name_to = name.y, betweenness)
# arrange named ties in decreasing order of betweenness
arrange(ties_selected, -betweenness)
# set (alpha) proportional to weight and node size proportional to degree
ggraph(g, layout = "with_kk") +
geom_edge_link(aes(alpha=weight)) +
geom_node_point(aes(size=degree))
# produce the same visualization but set node size proportional to strength
ggraph(g, layout = "with_kk") +
geom_edge_link(aes(alpha = weight)) +
geom_node_point(aes(size = strength))
# visualize the network with tie transparency proportional to betweenness
ggraph(g, layout = "with_kk") +
geom_edge_link(aes(alpha = betweenness)) +
geom_node_point()
# add node size proportional to degree
ggraph(g, layout = "with_kk") +
geom_edge_link(aes(alpha = betweenness)) +
geom_node_point(aes(size = degree))
# find median betweenness
# (was `q = median(...)`; use <- for top-level assignment, per R style)
q <- median(E(g)$betweenness)
# show only the ties with betweenness larger than the median, via the
# ggraph filter aesthetic; hide the legend
ggraph(g, layout = "with_kk") +
  geom_edge_link(aes(alpha = betweenness, filter = (betweenness > q))) +
  geom_node_point() +
  theme(legend.position="none")
# find number and percentage of ties at each weight level
# (weight-1 ties are the "weak" ties in this network)
ties %>%
group_by(weight) %>%
summarise(number = n(), percentage=n()/nrow(.)) %>%
arrange(-number)
# build vector weakness containing TRUE for weak ties (weight == 1).
# The comparison already yields a logical vector; the original's
# ifelse(x, TRUE, FALSE) wrapper was redundant.
weakness <- ties$weight == 1
# check that weakness contains the correct number of weak ties
sum(weakness)
# visualize the network by coloring the weak and strong ties
# (weakness is a plain vector, so it is looked up outside the graph data)
ggraph(g, layout = "with_kk") +
geom_edge_link(aes(color = weakness)) +
geom_node_point()
# visualize the network with only weak ties using the filter aesthetic
ggraph(g, layout = "with_kk") +
geom_edge_link(aes(filter=weakness), alpha = 0.5) +
geom_node_point()
Chapter 3 - Connection patterns
Connection patterns:
Pearson correlation coefficient:
Most similar and most dissimilar terrorists:
Example code includes:
# mutate ties data frame by swapping variables from and to (mutate evaluates
# its arguments sequentially, so temp preserves the original value of `to`)
ties_mutated <- mutate(ties, temp = to, to = from, from = temp) %>% select(-temp)
# append ties_mutated to ties so both directions of every tie are present
ties <- rbind(ties, ties_mutated)
# use a scatter plot to visualize node connection patterns in ties setting color aesthetic to weight
ggplot(ties, aes(x = from, y = to, color = factor(weight))) +
geom_point() +
labs(color = "weight")
# get the weighted adjacency matrix (dense, without dimnames)
A <- as_adjacency_matrix(g, attr = "weight", sparse = FALSE, names = FALSE)
# print the first row and first column of A
A[1, ]
A[, 1]
# print submatrix of the first 6 rows and columns
A[1:6, 1:6]
# obtain a vector of node strengths (row sums of the weighted matrix)
rowSums(A)
# build a Boolean (0/1) matrix from the weighted matrix A
B <- ifelse(A > 0, 1, 0)
# obtain a vector of node degrees using the Boolean matrix
rowSums(B)
# compute the Pearson correlation on columns of A (node connection profiles)
S <- cor(A)
# set the diagonal of S to 0: a node's self-similarity is uninformative
# (was `diag(S) = 0`; use <- for assignment, per R style)
diag(S) <- 0
# print a summary of the similarities in matrix S
summary(c(S))
# plot a histogram of similarities in matrix S
hist(c(S), xlab = "Similarity", main = "Histogram of similarity")
# Scatter plot of degree and strength with regression line
ggplot(nodes, aes(x = degree, y = strength)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE)
# Pearson correlation coefficient between degree and strength
cor(nodes$degree, nodes$strength)
# build weighted similarity network and save to h
h <- graph_from_adjacency_matrix(S, mode = "undirected", weighted = TRUE)
# convert the similarity network h into a similarity data frame sim_df
sim_df <- as_data_frame(h, what = "edges")
# map the similarity data frame to a tibble and save it as sim_tib
sim_tib <- as_tibble(sim_df)
# print sim_tib
sim_tib
# left join similarity and nodes data frames and then select and rename relevant variables
sim2 <- sim_tib %>%
left_join(nodes, c("from" = "id")) %>%
left_join(nodes, c("to" = "id")) %>%
select(from, to, name_from = name.x, name_to = name.y, similarity = weight,
degree_from = degree.x, degree_to = degree.y, strength_from = strength.x, strength_to = strength.y)
# print sim2
sim2
# arrange sim2 in decreasing order of similarity.
sim2 %>% arrange(-similarity)
# filter sim2, allowing only pairs with a degree of at least 10, arrange the result in decreasing order of similarity
sim2 %>%
filter(degree_from >= 10, degree_to >= 10) %>%
arrange(-similarity)
# Repeat the previous steps, but in increasing order of similarity
sim2 %>%
filter(degree_from >= 10, degree_to >= 10) %>%
arrange(similarity)
# filter the similarity data frame to similarities larger than or equal to 0.60
sim3 <- filter(sim2, similarity >= 0.6)
# build a similarity network called h2 from the filtered similarity data frame
h2 <- graph_from_data_frame(sim3, directed = FALSE)
# visualize the similarity network h2
ggraph(h2, layout = "with_kk") +
geom_edge_link(aes(alpha = similarity)) +
geom_node_point()
Chapter 4 - Similarity Clusters
Hierarchical clustering - find clusters of similar people:
Interactive visualizations with visNetwork:
Wrap up:
Example code includes:
# compute a distance matrix from the similarity matrix (distance = 1 - similarity)
D <- 1 - S
# obtain a distance object, as required by hclust()
d <- as.dist(D)
# run average-linkage clustering method and plot the dendrogram
cc <- hclust(d, method = "average")
plot(cc)
# find the similarity of the first pair of nodes that have been merged
S[40, 45]
# cut the dendrogram at 4 clusters
cls <- cutree(cc, k = 4)
# add cluster information to the nodes data frame
nodes <- mutate(nodes, cluster = cls)
# print the nodes data frame
nodes
# output the names of terrorists in the first cluster
filter(nodes, cluster == 1) %>%
select(name)
# for each cluster select the size of the cluster, the average node degree, and the average node strength and sorts by cluster size
group_by(nodes, cluster) %>%
summarise(size = n(),
avg_degree = mean(degree),
avg_strength = mean(strength)
) %>%
arrange(-size)
# add cluster information to the network
V(g)$cluster <- nodes$cluster
# visualize the original network with colored clusters
ggraph(g, layout = "with_kk") +
geom_edge_link(aes(alpha = weight), show.legend=FALSE) +
geom_node_point(aes(color = factor(cluster))) +
labs(color = "cluster")
# facet the network with respect to cluster attribute
ggraph(g, layout = "with_kk") +
geom_edge_link(aes(alpha = weight), show.legend=FALSE) +
geom_node_point(aes(color = factor(cluster))) +
facet_nodes(~cluster, scales="free") +
labs(color = "cluster")
# convert igraph to visNetwork
data <- visNetwork::toVisNetworkData(g)
# print head of nodes and ties
head(data$nodes)
head(data$edges)
# visualize the network
visNetwork::visNetwork(nodes = data$nodes, edges = data$edges, width = 300, height = 300)
# use the Kamada-Kawai layout (the original comment said "circle" in error)
visNetwork::visNetwork(nodes = data$nodes, edges = data$edges, width = 300, height = 300) %>%
visNetwork::visIgraphLayout(layout = "layout_with_kk")
# use the circle layout
visNetwork::visNetwork(nodes = data$nodes, edges = data$edges, width = 300, height = 300) %>%
visNetwork::visIgraphLayout(layout = "layout_in_circle")
# use the grid layout
visNetwork::visNetwork(nodes = data$nodes, edges = data$edges, width = 300, height = 300) %>%
visNetwork::visIgraphLayout(layout = "layout_on_grid")
# highlight nearest nodes and ties of the selected node
visNetwork::visNetwork(nodes = data$nodes, edges = data$edges, width = 300, height = 300) %>%
visNetwork::visIgraphLayout(layout = "layout_with_kk") %>%
visNetwork::visOptions(highlightNearest = TRUE)
# select nodes by id
visNetwork::visNetwork(nodes = data$nodes, edges = data$edges, width = 300, height = 300) %>%
visNetwork::visIgraphLayout(layout = "layout_with_kk") %>%
visNetwork::visOptions(nodesIdSelection = TRUE)
# set node color to cluster membership and regenerate the network data
# (was `V(g)$color = V(g)$cluster`; use <- for assignment, per R style)
V(g)$color <- V(g)$cluster
data <- visNetwork::toVisNetworkData(g)
# select by group (cluster)
visNetwork::visNetwork(nodes = data$nodes, edges = data$edges, width = 300, height = 300) %>%
visNetwork::visIgraphLayout(layout = "layout_with_kk") %>%
visNetwork::visOptions(selectedBy = "group")
Chapter 1 - Introduction to Data Privacy
Intro to Anonymization - Part I:
Intro to Anonymization - Part II:
Data Synthesis:
Example code includes:
# Load the example data sets used by the data-privacy exercises
load("./RInputFiles/dataPriv.RData")
# Preview data
whitehouse
## # A tibble: 469 x 5
## Name Status Salary Basis Title
## <chr> <chr> <dbl> <chr> <chr>
## 1 Abrams, Adam~ Employ~ 66300 Per An~ WESTERN REGIONAL COMMUNICATIONS DIRECTOR
## 2 Adams, Ian H. Employ~ 45000 Per An~ EXECUTIVE ASSISTANT TO THE DIRECTOR OF ~
## 3 Agnew, David~ Employ~ 93840 Per An~ DEPUTY DIRECTOR OF INTERGOVERNMENTAL AF~
## 4 Albino, James Employ~ 91800 Per An~ SENIOR PROGRAM MANAGER
## 5 Aldy, Jr., J~ Employ~ 130500 Per An~ SPECIAL ASSISTANT TO THE PRESIDENT FOR ~
## 6 Alley, Hilar~ Employ~ 42000 Per An~ STAFF ASSISTANT
## 7 Amorsingh, L~ Employ~ 56092 Per An~ SPECIAL ASSISTANT
## 8 Anderson, Am~ Employ~ 60000 Per An~ SPECIAL ASSISTANT TO THE CHIEF OF STAFF
## 9 Anderson, Ch~ Employ~ 51000 Per An~ POLICY ASSISTANT
## 10 Andrias, Kat~ Employ~ 130500 Per An~ SPECIAL ASSISTANT TO THE PRESIDENT AND ~
## # ... with 459 more rows
# Set seed so the pseudonyms below are reproducible
set.seed(42)
# Replace names with random numbers from 1 to 1000 (pseudonymization;
# replace = FALSE guarantees each pseudonym is unique)
whitehouse_no_names <- whitehouse %>%
mutate(Name = sample(1:1000, nrow(.), replace=FALSE))
whitehouse_no_names
## # A tibble: 469 x 5
## Name Status Salary Basis Title
## <int> <chr> <dbl> <chr> <chr>
## 1 915 Employee 66300 Per An~ WESTERN REGIONAL COMMUNICATIONS DIRECTOR
## 2 937 Employee 45000 Per An~ EXECUTIVE ASSISTANT TO THE DIRECTOR OF SCHEDUL~
## 3 286 Employee 93840 Per An~ DEPUTY DIRECTOR OF INTERGOVERNMENTAL AFFAIRS
## 4 828 Employee 91800 Per An~ SENIOR PROGRAM MANAGER
## 5 640 Employee 130500 Per An~ SPECIAL ASSISTANT TO THE PRESIDENT FOR ENERGY ~
## 6 517 Employee 42000 Per An~ STAFF ASSISTANT
## 7 733 Employee 56092 Per An~ SPECIAL ASSISTANT
## 8 134 Employee 60000 Per An~ SPECIAL ASSISTANT TO THE CHIEF OF STAFF
## 9 652 Employee 51000 Per An~ POLICY ASSISTANT
## 10 699 Employee 130500 Per An~ SPECIAL ASSISTANT TO THE PRESIDENT AND ASSOCIA~
## # ... with 459 more rows
# Rounding Salary to the nearest ten thousand (round(x, -4) rounds to 10^4)
whitehouse_no_identifiers <- whitehouse_no_names %>%
mutate(Salary = round(Salary, -4))
whitehouse_no_identifiers
## # A tibble: 469 x 5
## Name Status Salary Basis Title
## <int> <chr> <dbl> <chr> <chr>
## 1 915 Employee 70000 Per An~ WESTERN REGIONAL COMMUNICATIONS DIRECTOR
## 2 937 Employee 40000 Per An~ EXECUTIVE ASSISTANT TO THE DIRECTOR OF SCHEDUL~
## 3 286 Employee 90000 Per An~ DEPUTY DIRECTOR OF INTERGOVERNMENTAL AFFAIRS
## 4 828 Employee 90000 Per An~ SENIOR PROGRAM MANAGER
## 5 640 Employee 130000 Per An~ SPECIAL ASSISTANT TO THE PRESIDENT FOR ENERGY ~
## 6 517 Employee 40000 Per An~ STAFF ASSISTANT
## 7 733 Employee 60000 Per An~ SPECIAL ASSISTANT
## 8 134 Employee 60000 Per An~ SPECIAL ASSISTANT TO THE CHIEF OF STAFF
## 9 652 Employee 50000 Per An~ POLICY ASSISTANT
## 10 699 Employee 130000 Per An~ SPECIAL ASSISTANT TO THE PRESIDENT AND ASSOCIA~
## # ... with 459 more rows
# Convert the salaries into three categories (generalization):
# 0 = under 50k, 1 = 50k to under 100k, 2 = 100k and above
whitehouse.gen <- whitehouse %>%
mutate(Salary = ifelse(Salary < 50000, 0,
ifelse(Salary >= 50000 & Salary < 100000, 1, 2)))
whitehouse.gen
## # A tibble: 469 x 5
## Name Status Salary Basis Title
## <chr> <chr> <dbl> <chr> <chr>
## 1 Abrams, Adam~ Employ~ 1 Per An~ WESTERN REGIONAL COMMUNICATIONS DIRECTOR
## 2 Adams, Ian H. Employ~ 0 Per An~ EXECUTIVE ASSISTANT TO THE DIRECTOR OF ~
## 3 Agnew, David~ Employ~ 1 Per An~ DEPUTY DIRECTOR OF INTERGOVERNMENTAL AF~
## 4 Albino, James Employ~ 1 Per An~ SENIOR PROGRAM MANAGER
## 5 Aldy, Jr., J~ Employ~ 2 Per An~ SPECIAL ASSISTANT TO THE PRESIDENT FOR ~
## 6 Alley, Hilar~ Employ~ 0 Per An~ STAFF ASSISTANT
## 7 Amorsingh, L~ Employ~ 1 Per An~ SPECIAL ASSISTANT
## 8 Anderson, Am~ Employ~ 1 Per An~ SPECIAL ASSISTANT TO THE CHIEF OF STAFF
## 9 Anderson, Ch~ Employ~ 1 Per An~ POLICY ASSISTANT
## 10 Andrias, Kat~ Employ~ 2 Per An~ SPECIAL ASSISTANT TO THE PRESIDENT AND ~
## # ... with 459 more rows
# Bottom Coding: raise every salary below 45000 up to exactly 45000
whitehouse.bottom <- whitehouse %>%
mutate(Salary = pmax(Salary, 45000))
# Filter Results: everything still <= 45000 is now exactly the floor value
whitehouse.bottom %>%
filter(Salary <= 45000)
## # A tibble: 109 x 5
## Name Status Salary Basis Title
## <chr> <chr> <dbl> <chr> <chr>
## 1 Adams, Ian H. Employee 45000 Per An~ EXECUTIVE ASSISTANT TO THE DIRECTOR O~
## 2 Alley, Hilary~ Employee 45000 Per An~ STAFF ASSISTANT
## 3 Asen, Jonatha~ Employee 45000 Per An~ SENIOR ANALYST
## 4 Ayling, Linds~ Employee 45000 Per An~ ANALYST
## 5 Baggetto, Mau~ Employee 45000 Per An~ STAFF ASSISTANT
## 6 Bates, Andrew~ Employee 45000 Per An~ MEDIA MONITOR
## 7 Belive, Laure~ Employee 45000 Per An~ LEGISLATIVE ASSISTANT AND ASSISTANT F~
## 8 Bisi, Rachel ~ Employee 45000 Per An~ LEGISLATIVE ASSISTANT
## 9 Block, Michae~ Employee 45000 Per An~ STAFF ASSISTANT
## 10 Blount, Patri~ Employee 45000 Per An~ RECORDS MANAGEMENT ANALYST
## # ... with 99 more rows
# View fertility data (100 participants, 10 variables; see str below)
fertility
## # A tibble: 100 x 10
## Season Age Child_Disease Accident_Trauma Surgical_Interv~ High_Fevers
## <dbl> <dbl> <int> <int> <int> <int>
## 1 -0.33 0.69 0 1 1 0
## 2 -0.33 0.94 1 0 1 0
## 3 -0.33 0.5 1 0 0 0
## 4 -0.33 0.75 0 1 1 0
## 5 -0.33 0.67 1 1 0 0
## 6 -0.33 0.67 1 0 1 0
## 7 -0.33 0.67 0 0 0 -1
## 8 -0.33 1 1 1 1 0
## 9 1 0.64 0 0 1 0
## 10 1 0.61 1 0 0 0
## # ... with 90 more rows, and 4 more variables: Alcohol_Freq <dbl>,
## # Smoking <int>, Hours_Sitting <dbl>, Diagnosis <int>
# Number of participants with Surgical_Intervention and Diagnosis
# (both are 0/1 indicators, so their sums are counts)
fertility %>%
summarise_at(vars(Surgical_Intervention, Diagnosis), sum)
## # A tibble: 1 x 2
## Surgical_Intervention Diagnosis
## <int> <int>
## 1 51 12
# Mean and Standard Deviation of Age
# NOTE(review): funs() is soft-deprecated (see warning below);
# list(mean = mean, sd = sd) is the modern replacement
fertility %>%
summarise_at(vars(Age), funs(mean, sd))
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## Please use a list of either functions or lambdas:
##
## # Simple named list:
## list(mean = mean, median = median)
##
## # Auto named with `tibble::lst()`:
## tibble::lst(mean, median)
##
## # Using lambdas
## list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once per session.
## # A tibble: 1 x 2
## mean sd
## <dbl> <dbl>
## 1 0.669 0.121
# Counts of the Groups in High_Fevers
fertility %>%
count(High_Fevers)
## # A tibble: 3 x 2
## High_Fevers n
## <int> <int>
## 1 -1 9
## 2 0 63
## 3 1 28
# Cross-tabulated counts of Child_Disease and Accident_Trauma
fertility %>%
count(Child_Disease, Accident_Trauma)
## # A tibble: 4 x 3
## Child_Disease Accident_Trauma n
## <int> <int> <int>
## 1 0 0 10
## 2 0 1 3
## 3 1 0 46
## 4 1 1 41
# Find proportions (the mean of a 0/1 indicator is a proportion)
fertility %>%
summarise_at(vars(Accident_Trauma, Surgical_Intervention), mean)
## # A tibble: 1 x 2
## Accident_Trauma Surgical_Intervention
## <dbl> <dbl>
## 1 0.44 0.51
# Set seed
set.seed(42)
# Generate Synthetic data: Bernoulli draws matching the observed proportions
accident <- rbinom(100, 1, prob=0.440)
surgical <- rbinom(100, 1, prob=0.510)
# Square root Transformation of Salary (brings the distribution closer to normal)
whitehouse.salary <- whitehouse %>%
mutate(Salary = sqrt(Salary))
# Calculate the mean and standard deviation on the transformed scale
stats <- whitehouse.salary %>%
summarize(mean(Salary), sd(Salary))
stats
## # A tibble: 1 x 2
## `mean(Salary)` `sd(Salary)`
## <dbl> <dbl>
## 1 279. 71.8
# Generate Synthetic data on the transformed (square-root) scale
set.seed(42)
salary_transformed <- rnorm(nrow(whitehouse), mean=279, sd=71.8)
# Power transformation back to the original salary scale
salary_original <- salary_transformed ** 2
# Hard bound: squaring already guarantees non-negative values, so this clip
# never triggers here; kept as a safety net
salary <- ifelse(salary_original < 0, 0, salary_original)
Chapter 2 - Introduction to Differential Privacy
Differential Privacy - quantification of privacy loss via a privacy budget:
Global Sensitivity - usual decision-making factor for differential privacy:
Laplace Mechanism - adds noise based on the Laplace distribution with mean 0 and parameters global sensitivity and privacy budget:
Example code includes:
# Number of observations
n <- nrow(fertility)
# Global sensitivity of counts (adding/removing one person changes a count by at most 1)
gs.count <- 1
# Global sensitivity of proportions
gs.prop <- 1/n
# Lower bound of Hours_Sitting
a <- 0
# Upper bound of Hours_Sitting
b <- 1
# Global sensitivity of mean for Hours_Sitting
gs.mean <- (b - a) / n
# Global sensitivity of variance for Hours_Sitting
# (original comment said "proportions" in error)
gs.var <- (b - a)**2 / n
# How many participants had a Surgical_Intervention?
fertility %>%
summarise_at(vars(Surgical_Intervention), sum)
## # A tibble: 1 x 1
## Surgical_Intervention
## <int>
## 1 51
# Set the seed
set.seed(42)
# Apply the Laplace mechanism: double-exponential noise with scale
# (global sensitivity / epsilon) added to the true count of 51
eps <- 0.1
smoothmest::rdoublex(1, 51, 1/eps)
## [1] 52.98337
# Proportion of Accident_Trauma
stats <- fertility %>%
summarise_at(vars(Accident_Trauma), mean)
stats
## # A tibble: 1 x 1
## Accident_Trauma
## <dbl>
## 1 0.44
# Set the seed
set.seed(42)
# Apply the Laplace mechanism to the proportion (sensitivity 1/n)
eps <- 0.1
smoothmest::rdoublex(1, 0.440, (1/n)/eps)
## [1] 0.4598337
# Mean and Variance of Hours Sitting
fertility %>%
summarise_at(vars(Hours_Sitting), funs(mean, var))
## # A tibble: 1 x 2
## mean var
## <dbl> <dbl>
## 1 0.407 0.0347
# Setup
set.seed(42)
eps <- 0.1
# Laplace mechanism applied to the mean
smoothmest::rdoublex(1, 0.41, gs.mean/eps)
## [1] 0.4298337
# Laplace mechanism applied to the variance
smoothmest::rdoublex(1, 0.03, gs.var/eps)
## [1] 0.0583491
Chapter 3 - Differentially Private Properties
Sequential Composition - method to require that someone cannot find the real answer by just sending multiple queries:
Parallel Composition - method to account for queries to different parts of the database (no adjustment to epsilon needed):
Post-processing:
Impossible and inconsistent answers:
Example code includes:
# Set Value of Epsilon: a total budget of 0.1 split across the two queries
# below (sequential composition)
eps <- 0.1 / 2
# Number of observations
n <- nrow(fertility)
# Lower bound of Age
a <- 0
# Upper bound of Age
b <- 1
# GS of counts for Diagnosis
gs.count <- 1
# GS of mean for Age
gs.mean <- (b-a)/n
# Number of Participants with abnormal diagnosis
stats1 <- fertility %>%
summarize_at(vars(Diagnosis), sum)
stats1
## # A tibble: 1 x 1
## Diagnosis
## <int>
## 1 12
# Mean of age
stats2 <- fertility %>%
summarize_at(vars(Age), mean)
stats2
## # A tibble: 1 x 1
## Age
## <dbl>
## 1 0.669
# Set seed
set.seed(42)
# Laplace mechanism to the count of abnormal diagnosis
smoothmest::rdoublex(1, 12, gs.count/eps)
## [1] 15.96674
# Laplace mechanism to the mean of age
smoothmest::rdoublex(1, 0.67, gs.mean/eps)
## [1] 0.7266982
# Set Value of Epsilon: the two queries below touch disjoint subsets of the
# data (normal vs abnormal diagnosis), so under parallel composition the
# full budget can be used for each
eps <- 0.1
# Mean of Age per diagnosis level
fertility %>%
group_by(Diagnosis) %>%
summarise_at(vars(Age), mean)
## # A tibble: 2 x 2
## Diagnosis Age
## <int> <dbl>
## 1 0 0.664
## 2 1 0.707
# Set the seed
set.seed(42)
# Laplace mechanism to the mean age of participants with an abnormal diagnosis
smoothmest::rdoublex(1, 0.71, gs.mean/eps)
## [1] 0.7298337
# Laplace mechanism to the mean age of participants with a normal diagnosis
smoothmest::rdoublex(1, 0.66, gs.mean/eps)
## [1] 0.6883491
# Set Value of Epsilon: a 0.5 budget split across three count queries
eps <- 0.5/3
# GS of Counts
gs.count <- 1
# Number of participants in each of the four seasons
# NOTE(review): the query below looks mispasted — it repeats the
# mean-Age-by-Diagnosis summary instead of count(Season); the Laplace calls
# further down use the true season counts (winter 28, spring 37, summer 4)
fertility %>%
group_by(Diagnosis) %>%
summarise_at(vars(Age), mean)
## # A tibble: 2 x 2
## Diagnosis Age
## <int> <dbl>
## 1 0 0.664
## 2 1 0.707
# Set the seed
set.seed(42)
# Laplace mechanism to the number of participants who were evaluated in the winter, spring, and summer
winter <- smoothmest::rdoublex(1, 28, gs.count / eps) %>%
round()
spring <- smoothmest::rdoublex(1, 37, gs.count / eps) %>%
round()
summer <- smoothmest::rdoublex(1, 4, gs.count / eps) %>%
round()
# Post-process based on previous queries: the fourth count is derived from
# the total, so it consumes no additional privacy budget
fall <- nrow(fertility) - winter - spring - summer
# Set Value of Epsilon (very small, so the added noise is large)
eps <- 0.01
# GS of counts
gs.count <- 1
# Number of Participants with Child_Disease
fertility %>%
summarise_at(vars(Child_Disease), sum)
## # A tibble: 1 x 1
## Child_Disease
## <int>
## 1 87
# Apply the Laplace mechanism
set.seed(42)
lap_childhood <- smoothmest::rdoublex(1, 87, gs.count / eps) %>%
round()
# Total number of observations in fertility
max_value <- nrow(fertility)
# Bound the value such that the noisy answer does not exceed the total number of observations
ifelse(lap_childhood > max_value, max_value, lap_childhood)
## [1] 100
# Set the seed
set.seed(42)
# Apply the Laplace mechanism; max(0) clips impossible negative counts
fever1 <- smoothmest::rdoublex(1, 9, gs.count/eps) %>%
max(0)
fever2 <- smoothmest::rdoublex(1, 63, gs.count/eps) %>%
max(0)
fever3 <- smoothmest::rdoublex(1, 28, gs.count/eps) %>%
max(0)
fever <- c(fever1, fever2, fever3)
# Normalize the noisy counts so they sum to the number of observations
fever_normalized <- (fever/sum(fever)) * (nrow(fertility))
# Round the values
round(fever_normalized)
## [1] 24 76 0
Chapter 4 - Differentially Private Data Synthesis
Laplace Sanitizer - basic way to generate “noisy” categorical data:
Parametric Approaches:
Wrap up:
Example code includes:
# Set Value of Epsilon
eps <- 0.1
# GS of Counts
gs.count <- 1
# Number of participants in each season
fertility %>%
count(Season)
## # A tibble: 4 x 2
## Season n
## <dbl> <int>
## 1 -1 28
## 2 -0.33 37
## 3 0.33 4
## 4 1 31
# Set the seed
set.seed(42)
# Apply the Laplace mechanism
winter <- smoothmest::rdoublex(1, 28, gs.count/eps) %>% max(0)
spring <- smoothmest::rdoublex(1, 37, gs.count/eps) %>% max(0)
summer <- smoothmest::rdoublex(1, 4, gs.count/eps) %>% max(0)
fall <- smoothmest::rdoublex(1, 31, gs.count/eps) %>% max(0)
# Store noisy results
seasons <- c(winter = winter, spring = spring, summer = summer, fall = fall)
# Normalizing seasons
seasons_normalized <- (seasons/sum(seasons)) * nrow(fertility)
# Round the values
round(seasons_normalized)
## winter spring summer fall
## 29 38 0 33
# Generate synthetic data for winter (Season code -1, 29 noisy participants)
rep(-1, 29)
## [1] -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
## [26] -1 -1 -1 -1
# Generate synthetic data for spring (Season code -0.33)
rep(-0.33, 38)
## [1] -0.33 -0.33 -0.33 -0.33 -0.33 -0.33 -0.33 -0.33 -0.33 -0.33 -0.33 -0.33
## [13] -0.33 -0.33 -0.33 -0.33 -0.33 -0.33 -0.33 -0.33 -0.33 -0.33 -0.33 -0.33
## [25] -0.33 -0.33 -0.33 -0.33 -0.33 -0.33 -0.33 -0.33 -0.33 -0.33 -0.33 -0.33
## [37] -0.33 -0.33
# Generate synthetic data for summer (noisy count of 0, so no rows)
rep(0.33, 0)
## numeric(0)
# Generate synthetic data for fall (Season code 1)
rep(1, 33)
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
# Calculate proportions of the two binary variables
fertility %>%
summarise_at(vars(Accident_Trauma, Surgical_Intervention), mean)
## # A tibble: 1 x 2
## Accident_Trauma Surgical_Intervention
## <dbl> <dbl>
## 1 0.44 0.51
# Number of Observations
n <- nrow(fertility)
# Set Value of Epsilon
eps <- 0.1
# GS of Proportion
gs.prop <- (1/n)
# Apply the Laplace mechanism to each proportion
set.seed(42)
smoothmest::rdoublex(1, 0.44, gs.prop/eps)
## [1] 0.4598337
smoothmest::rdoublex(1, 0.51, gs.prop/eps)
## [1] 0.5383491
# Generate Synthetic data from the (rounded) noisy proportions
set.seed(42)
accident <- rbinom(n, 1, 0.46)
surgical <- rbinom(n, 1, 0.54)
# Set Value of Epsilon: the 0.1 budget is split across the mean and variance queries
eps <- 0.1 / 2
# Number of observations
n <- nrow(fertility)
# Upper and lower bounds of age
a <- 0
b <- 1
# GS of mean and variance for age
gs.mean <- (b-a) / n
gs.var <- (b-a)**2 / n
# Mean and Variance of Age
fertility %>%
summarise_at(vars(Age), funs(mean, var))
## # A tibble: 1 x 2
## mean var
## <dbl> <dbl>
## 1 0.669 0.0147
# Apply the Laplace mechanism to the mean and to the variance
set.seed(42)
smoothmest::rdoublex(1, 0.67, gs.mean/eps)
## [1] 0.7096674
smoothmest::rdoublex(1, 0.01, gs.var/eps)
## [1] 0.06669821
# Generate Synthetic data from a normal with the noisy mean and variance
set.seed(42)
age <- rnorm(n, mean=0.71, sd=sqrt(0.07))
# Hard Bounding the data to the valid [0, 1] range for Age
age[age < 0] <- 0
age[age > 1] <- 1
Chapter 1 - Modeling Customer Lifetime Value with Linear Regression
Introduction - Verena from INWT Statistics (consultancy in marketing analytics):
Simple linear regression - one predictor variable to predict one response variable:
Multiple linear regression:
Model validation, fit, and prediction:
Example code includes:
# Load the customer sales data for the customer-lifetime-value regressions
salesData <- readr::read_csv("./RInputFiles/salesData.csv")
## Parsed with column specification:
## cols(
## id = col_double(),
## nItems = col_double(),
## mostFreqStore = col_character(),
## mostFreqCat = col_character(),
## nCats = col_double(),
## preferredBrand = col_character(),
## nBrands = col_double(),
## nPurch = col_double(),
## salesLast3Mon = col_double(),
## salesThisMon = col_double(),
## daysSinceLastPurch = col_double(),
## meanItemPrice = col_double(),
## meanShoppingCartValue = col_double(),
## customerDuration = col_double()
## )
# Structure of dataset (give.attr = FALSE suppresses attribute noise)
str(salesData, give.attr = FALSE)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 5122 obs. of 14 variables:
## $ id : num 1 2 3 4 5 6 7 8 9 10 ...
## $ nItems : num 1469 1463 262 293 108 ...
## $ mostFreqStore : chr "Stockton" "Stockton" "Colorado Springs" "Colorado Springs" ...
## $ mostFreqCat : chr "Alcohol" "Alcohol" "Shoes" "Bakery" ...
## $ nCats : num 72 73 55 50 32 41 36 31 41 52 ...
## $ preferredBrand : chr "Veina" "Veina" "Bo" "Veina" ...
## $ nBrands : num 517 482 126 108 79 98 78 62 99 103 ...
## $ nPurch : num 82 88 56 43 18 35 34 12 26 33 ...
## $ salesLast3Mon : num 2742 2791 1530 1766 1180 ...
## $ salesThisMon : num 1284 1243 683 730 553 ...
## $ daysSinceLastPurch : num 1 1 1 1 12 2 2 4 14 1 ...
## $ meanItemPrice : num 1.87 1.91 5.84 6.03 10.93 ...
## $ meanShoppingCartValue: num 33.4 31.7 27.3 41.1 65.6 ...
## $ customerDuration : num 821 657 548 596 603 673 612 517 709 480 ...
# Visualization of correlations
# (correlation plot of all numeric columns; id is an identifier, not a
# predictor, so it is dropped first)
salesData %>% select_if(is.numeric) %>%
select(-id) %>%
cor() %>%
corrplot::corrplot()
# Frequent stores
# (boxplots of this month's sales by the customer's most frequent store)
ggplot(salesData) +
geom_boxplot(aes(x = mostFreqStore, y = salesThisMon))
# Preferred brand
ggplot(salesData) +
geom_boxplot(aes(x = preferredBrand, y = salesThisMon))
# Model specification using lm
# (simple regression: this month's sales on the previous three months' sales)
salesSimpleModel <- lm(salesThisMon ~ salesLast3Mon, data = salesData)
# Looking at model summary
summary(salesSimpleModel)
##
## Call:
## lm(formula = salesThisMon ~ salesLast3Mon, data = salesData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -570.18 -68.26 3.21 72.98 605.58
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 99.690501 6.083886 16.39 <2e-16 ***
## salesLast3Mon 0.382696 0.004429 86.40 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 117.5 on 5120 degrees of freedom
## Multiple R-squared: 0.5932, Adjusted R-squared: 0.5931
## F-statistic: 7465 on 1 and 5120 DF, p-value: < 2.2e-16
# Estimating the full model
# ("~ . -id" regresses salesThisMon on every other column except the id)
salesModel1 <- lm(salesThisMon ~ . -id, data = salesData)
# Checking variance inflation factors
# (generalized VIFs; large values — commonly > 5-10 — flag multicollinearity)
car::vif(salesModel1)
## Registered S3 methods overwritten by 'car':
## method from
## influence.merMod lme4
## cooks.distance.influence.merMod lme4
## dfbeta.influence.merMod lme4
## dfbetas.influence.merMod lme4
## GVIF Df GVIF^(1/(2*Df))
## nItems 11.772600 1 3.431122
## mostFreqStore 1.260469 9 1.012943
## mostFreqCat 1.527348 9 1.023809
## nCats 8.402073 1 2.898633
## preferredBrand 1.682184 9 1.029316
## nBrands 14.150868 1 3.761764
## nPurch 3.083952 1 1.756119
## salesLast3Mon 8.697663 1 2.949180
## daysSinceLastPurch 1.585057 1 1.258991
## meanItemPrice 1.987665 1 1.409846
## meanShoppingCartValue 2.247579 1 1.499193
## customerDuration 1.004664 1 1.002329
# Estimating new model by removing information on brand
# (nBrands showed the largest GVIF above, so the brand variables are dropped)
salesModel2 <- lm(salesThisMon ~ . -id -preferredBrand -nBrands, data = salesData)
# Checking variance inflation factors
car::vif(salesModel2)
## GVIF Df GVIF^(1/(2*Df))
## nItems 6.987456 1 2.643380
## mostFreqStore 1.178251 9 1.009154
## mostFreqCat 1.269636 9 1.013351
## nCats 5.813494 1 2.411119
## nPurch 3.069046 1 1.751869
## salesLast3Mon 8.412520 1 2.900435
## daysSinceLastPurch 1.579426 1 1.256752
## meanItemPrice 1.925494 1 1.387622
## meanShoppingCartValue 2.238410 1 1.496132
## customerDuration 1.002981 1 1.001489
# Load the later cohort (months 2-4) used for out-of-sample prediction.
salesData2_4 <- readr::read_csv("./RInputFiles/salesDataMon2To4.csv")
## Parsed with column specification:
## cols(
## id = col_double(),
## nItems = col_double(),
## mostFreqStore = col_character(),
## mostFreqCat = col_character(),
## nCats = col_double(),
## preferredBrand = col_character(),
## nBrands = col_double(),
## nPurch = col_double(),
## salesLast3Mon = col_double(),
## daysSinceLastPurch = col_double(),
## meanItemPrice = col_double(),
## meanShoppingCartValue = col_double(),
## customerDuration = col_double()
## )
# getting an overview of new data
summary(salesData2_4)
## id nItems mostFreqStore mostFreqCat
## Min. : 1 Min. : 1.0 Length:5173 Length:5173
## 1st Qu.:1372 1st Qu.: 84.0 Class :character Class :character
## Median :2733 Median : 155.0 Mode :character Mode :character
## Mean :2729 Mean : 185.9
## 3rd Qu.:4085 3rd Qu.: 257.0
## Max. :5455 Max. :1461.0
## nCats preferredBrand nBrands nPurch
## Min. : 1.00 Length:5173 Min. : 1.00 Min. : 1.00
## 1st Qu.:27.00 Class :character 1st Qu.: 45.00 1st Qu.:11.00
## Median :37.00 Mode :character Median : 75.00 Median :17.00
## Mean :36.23 Mean : 81.66 Mean :20.02
## 3rd Qu.:46.00 3rd Qu.:110.00 3rd Qu.:27.00
## Max. :74.00 Max. :484.00 Max. :86.00
## salesLast3Mon daysSinceLastPurch meanItemPrice meanShoppingCartValue
## Min. : 189 Min. : 1.000 Min. : 1.879 Min. : 17.58
## 1st Qu.:1068 1st Qu.: 2.000 1st Qu.: 6.049 1st Qu.: 53.88
## Median :1331 Median : 4.000 Median : 8.556 Median : 75.77
## Mean :1324 Mean : 6.589 Mean : 12.116 Mean : 91.88
## 3rd Qu.:1570 3rd Qu.: 7.000 3rd Qu.: 12.969 3rd Qu.: 109.74
## Max. :2745 Max. :87.000 Max. :313.050 Max. :1147.66
## customerDuration
## Min. : 31.0
## 1st Qu.: 580.0
## Median : 682.0
## Mean : 676.8
## 3rd Qu.: 777.0
## Max. :1386.0
# predicting sales
# (apply the reduced model to the newer cohort; note salesThisMon is absent
# from salesData2_4's column spec above, which is why this is a prediction)
predSales5 <- predict(salesModel2, newdata = salesData2_4)
# calculating mean of future sales
mean(predSales5)
## [1] 625.1438
Chapter 2 - Logistic Regression for Churn Prevention
Churn prevention in online marketing:
Modeling and model selection:
In-sample model fit and thresholding:
Out-of-sample validation and cross validation:
Example code includes:
# Credit-card default data; semicolon-delimited, all columns read as doubles.
defaultData <- readr::read_delim("./RInputFiles/defaultData.csv", delim=";")
## Parsed with column specification:
## cols(
## .default = col_double()
## )
## See spec(...) for full column specifications.
# Summary of data
summary(defaultData)
## ID limitBal sex education
## Min. : 1 Min. : 10000 Min. :1.000 Min. :0.000
## 1st Qu.: 4501 1st Qu.: 50000 1st Qu.:1.000 1st Qu.:1.000
## Median : 9000 Median : 130000 Median :2.000 Median :2.000
## Mean : 9000 Mean : 162902 Mean :1.588 Mean :1.835
## 3rd Qu.:13500 3rd Qu.: 230000 3rd Qu.:2.000 3rd Qu.:2.000
## Max. :18000 Max. :1000000 Max. :2.000 Max. :6.000
## marriage age pay1 pay2
## Min. :0.00 Min. :21.00 Min. :-2.00000 Min. :-2.0000
## 1st Qu.:1.00 1st Qu.:28.00 1st Qu.:-1.00000 1st Qu.:-1.0000
## Median :2.00 Median :34.00 Median : 0.00000 Median : 0.0000
## Mean :1.56 Mean :35.48 Mean : 0.02783 Mean :-0.1017
## 3rd Qu.:2.00 3rd Qu.:41.00 3rd Qu.: 0.00000 3rd Qu.: 0.0000
## Max. :3.00 Max. :75.00 Max. : 8.00000 Max. : 8.0000
## pay3 pay4 pay5 pay6
## Min. :-2.0000 Min. :-2.0000 Min. :-2.000 Min. :-2.0000
## 1st Qu.:-1.0000 1st Qu.:-1.0000 1st Qu.:-1.000 1st Qu.:-1.0000
## Median : 0.0000 Median : 0.0000 Median : 0.000 Median : 0.0000
## Mean :-0.1294 Mean :-0.1974 Mean :-0.228 Mean :-0.2567
## 3rd Qu.: 0.0000 3rd Qu.: 0.0000 3rd Qu.: 0.000 3rd Qu.: 0.0000
## Max. : 8.0000 Max. : 8.0000 Max. : 8.000 Max. : 8.0000
## billAmt1 billAmt2 billAmt3 billAmt4
## Min. :-165580 Min. :-33350 Min. : -34041 Min. :-170000
## 1st Qu.: 3675 1st Qu.: 3149 1st Qu.: 2655 1st Qu.: 2245
## Median : 22450 Median : 21425 Median : 20035 Median : 18703
## Mean : 50030 Mean : 48131 Mean : 45607 Mean : 41074
## 3rd Qu.: 65001 3rd Qu.: 62157 3rd Qu.: 58457 3rd Qu.: 50540
## Max. : 964511 Max. :983931 Max. :1664089 Max. : 891586
## billAmt5 billAmt6 payAmt1 payAmt2
## Min. :-37594 Min. :-339603 Min. : 0 Min. : 0
## 1st Qu.: 1684 1st Qu.: 1150 1st Qu.: 949 1st Qu.: 696
## Median : 18046 Median : 16780 Median : 2087 Median : 2000
## Mean : 39398 Mean : 38009 Mean : 5532 Mean : 5731
## 3rd Qu.: 49355 3rd Qu.: 48442 3rd Qu.: 5000 3rd Qu.: 5000
## Max. :927171 Max. : 961664 Max. :505000 Max. :1684259
## payAmt3 payAmt4 payAmt5 payAmt6
## Min. : 0 Min. : 0 Min. : 0.0 Min. : 0
## 1st Qu.: 307 1st Qu.: 228 1st Qu.: 209.8 1st Qu.: 2
## Median : 1500 Median : 1486 Median : 1500.0 Median : 1400
## Mean : 4629 Mean : 4757 Mean : 4763.7 Mean : 5135
## 3rd Qu.: 4000 3rd Qu.: 4000 3rd Qu.: 4000.0 3rd Qu.: 4000
## Max. :896040 Max. :497000 Max. :417990.0 Max. :528666
## PaymentDefault
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.2306
## 3rd Qu.:0.0000
## Max. :1.0000
# Look at data structure
# (18000 rows x 25 columns; PaymentDefault is the 0/1 response)
str(defaultData, give.attr=FALSE)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 18000 obs. of 25 variables:
## $ ID : num 1 2 3 4 5 6 7 8 9 10 ...
## $ limitBal : num 20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
## $ sex : num 2 2 2 2 1 1 1 2 2 1 ...
## $ education : num 2 2 2 2 2 1 1 2 3 3 ...
## $ marriage : num 1 2 2 1 1 2 2 2 1 2 ...
## $ age : num 24 26 34 37 57 37 29 23 28 35 ...
## $ pay1 : num 2 -1 0 0 -1 0 0 0 0 -2 ...
## $ pay2 : num 2 2 0 0 0 0 0 -1 0 -2 ...
## $ pay3 : num -1 0 0 0 -1 0 0 -1 2 -2 ...
## $ pay4 : num -1 0 0 0 0 0 0 0 0 -2 ...
## $ pay5 : num -2 0 0 0 0 0 0 0 0 -1 ...
## $ pay6 : num -2 2 0 0 0 0 0 -1 0 -1 ...
## $ billAmt1 : num 3913 2682 29239 46990 8617 ...
## $ billAmt2 : num 3102 1725 14027 48233 5670 ...
## $ billAmt3 : num 689 2682 13559 49291 35835 ...
## $ billAmt4 : num 0 3272 14331 28314 20940 ...
## $ billAmt5 : num 0 3455 14948 28959 19146 ...
## $ billAmt6 : num 0 3261 15549 29547 19131 ...
## $ payAmt1 : num 0 0 1518 2000 2000 ...
## $ payAmt2 : num 689 1000 1500 2019 36681 ...
## $ payAmt3 : num 0 1000 1000 1200 10000 657 38000 0 432 0 ...
## $ payAmt4 : num 0 1000 1000 1100 9000 ...
## $ payAmt5 : num 0 0 1000 1069 689 ...
## $ payAmt6 : num 0 2000 5000 1000 679 ...
## $ PaymentDefault: num 1 1 0 0 0 0 0 0 0 0 ...
# Analyze the balancedness of dependent variable
# geom_bar() counts the discrete 0/1 values directly; the original
# geom_histogram(stat = "count") produced the same plot but triggered the
# "Ignoring unknown parameters: binwidth, bins, pad" warning because
# histogram-specific parameters do not apply to a count stat.
ggplot(defaultData, aes(x = PaymentDefault)) +
geom_bar()
# Build logistic regression model
# (logit model of the binary PaymentDefault on all 23 predictors)
logitModelFull <- glm(PaymentDefault ~ limitBal + sex + education + marriage +
age + pay1 + pay2 + pay3 + pay4 + pay5 + pay6 + billAmt1 +
billAmt2 + billAmt3 + billAmt4 + billAmt5 + billAmt6 + payAmt1 +
payAmt2 + payAmt3 + payAmt4 + payAmt5 + payAmt6,
family = "binomial", data = defaultData)
# Take a look at the model
summary(logitModelFull)
##
## Call:
## glm(formula = PaymentDefault ~ limitBal + sex + education + marriage +
## age + pay1 + pay2 + pay3 + pay4 + pay5 + pay6 + billAmt1 +
## billAmt2 + billAmt3 + billAmt4 + billAmt5 + billAmt6 + payAmt1 +
## payAmt2 + payAmt3 + payAmt4 + payAmt5 + payAmt6, family = "binomial",
## data = defaultData)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.0893 -0.7116 -0.5615 -0.2794 4.2501
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.711e-01 1.505e-01 -3.795 0.000148 ***
## limitBal -4.825e-07 1.985e-07 -2.431 0.015052 *
## sex -8.251e-02 3.880e-02 -2.127 0.033457 *
## education -1.217e-01 2.745e-02 -4.434 9.23e-06 ***
## marriage -1.711e-01 4.016e-02 -4.259 2.05e-05 ***
## age 4.824e-03 2.257e-03 2.137 0.032570 *
## pay1 5.743e-01 2.221e-02 25.864 < 2e-16 ***
## pay2 5.156e-02 2.552e-02 2.020 0.043336 *
## pay3 7.811e-02 2.863e-02 2.728 0.006375 **
## pay4 -1.191e-02 3.285e-02 -0.363 0.716838
## pay5 1.080e-01 3.381e-02 3.193 0.001406 **
## pay6 -1.956e-02 2.750e-02 -0.711 0.476852
## billAmt1 -7.948e-06 1.582e-06 -5.023 5.09e-07 ***
## billAmt2 4.911e-06 2.006e-06 2.448 0.014350 *
## billAmt3 4.203e-07 1.698e-06 0.247 0.804572
## billAmt4 -1.587e-08 1.872e-06 -0.008 0.993234
## billAmt5 9.703e-07 2.154e-06 0.451 0.652293
## billAmt6 6.758e-07 1.591e-06 0.425 0.670955
## payAmt1 -1.878e-05 3.252e-06 -5.777 7.61e-09 ***
## payAmt2 -6.406e-06 2.364e-06 -2.710 0.006731 **
## payAmt3 -3.325e-06 2.401e-06 -1.385 0.166153
## payAmt4 -3.922e-06 2.342e-06 -1.675 0.093970 .
## payAmt5 -2.383e-06 2.168e-06 -1.099 0.271635
## payAmt6 -1.916e-06 1.618e-06 -1.184 0.236521
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 19438 on 17999 degrees of freedom
## Residual deviance: 17216 on 17976 degrees of freedom
## AIC: 17264
##
## Number of Fisher Scoring iterations: 5
# Take a look at the odds
# (exponentiated logit coefficients are multiplicative odds ratios)
coefsexp <- coef(logitModelFull) %>% exp() %>% round(2)
coefsexp
## (Intercept) limitBal sex education marriage age
## 0.56 1.00 0.92 0.89 0.84 1.00
## pay1 pay2 pay3 pay4 pay5 pay6
## 1.78 1.05 1.08 0.99 1.11 0.98
## billAmt1 billAmt2 billAmt3 billAmt4 billAmt5 billAmt6
## 1.00 1.00 1.00 1.00 1.00 1.00
## payAmt1 payAmt2 payAmt3 payAmt4 payAmt5 payAmt6
## 1.00 1.00 1.00 1.00 1.00 1.00
# The old (full) model, refitted so stepAIC() can start from it.
# (data is now passed as a named argument, consistent with the first fit.)
logitModelFull <- glm(PaymentDefault ~ limitBal + sex + education + marriage +
age + pay1 + pay2 + pay3 + pay4 + pay5 + pay6 + billAmt1 +
billAmt2 + billAmt3 + billAmt4 + billAmt5 + billAmt6 + payAmt1 +
payAmt2 + payAmt3 + payAmt4 + payAmt5 + payAmt6,
family = binomial, data = defaultData)
# Build the new model via stepwise AIC-based backward selection
# (trace = 0 silences the per-step output)
logitModelNew <- MASS::stepAIC(logitModelFull, trace=0)
# Look at the model
summary(logitModelNew)
##
## Call:
## glm(formula = PaymentDefault ~ limitBal + sex + education + marriage +
## age + pay1 + pay2 + pay3 + pay5 + billAmt1 + billAmt2 + billAmt5 +
## payAmt1 + payAmt2 + payAmt3 + payAmt4, family = binomial,
## data = defaultData)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.0839 -0.7119 -0.5611 -0.2839 4.1800
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.699e-01 1.504e-01 -3.790 0.000151 ***
## limitBal -5.201e-07 1.954e-07 -2.661 0.007791 **
## sex -8.206e-02 3.878e-02 -2.116 0.034338 *
## education -1.212e-01 2.744e-02 -4.418 9.96e-06 ***
## marriage -1.724e-01 4.014e-02 -4.295 1.75e-05 ***
## age 4.863e-03 2.256e-03 2.156 0.031092 *
## pay1 5.740e-01 2.218e-02 25.882 < 2e-16 ***
## pay2 4.979e-02 2.552e-02 1.951 0.051048 .
## pay3 7.197e-02 2.573e-02 2.798 0.005146 **
## pay5 8.859e-02 2.249e-02 3.938 8.20e-05 ***
## billAmt1 -8.130e-06 1.580e-06 -5.144 2.69e-07 ***
## billAmt2 5.238e-06 1.775e-06 2.951 0.003165 **
## billAmt5 1.790e-06 8.782e-07 2.038 0.041554 *
## payAmt1 -1.931e-05 3.258e-06 -5.928 3.06e-09 ***
## payAmt2 -6.572e-06 2.092e-06 -3.142 0.001681 **
## payAmt3 -3.693e-06 2.187e-06 -1.689 0.091241 .
## payAmt4 -4.611e-06 2.062e-06 -2.237 0.025306 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 19438 on 17999 degrees of freedom
## Residual deviance: 17220 on 17983 degrees of freedom
## AIC: 17254
##
## Number of Fisher Scoring iterations: 5
# Save the formula of the new model (it will be needed for the out-of-sample part)
# formula() is the supported accessor for a fitted model's formula; the
# original as.formula(summary(logitModelNew)$call) rebuilt it indirectly by
# coercing the stored call object, which is fragile.
formulaLogit <- formula(logitModelNew)
formulaLogit
## PaymentDefault ~ limitBal + sex + education + marriage + age +
## pay1 + pay2 + pay3 + pay5 + billAmt1 + billAmt2 + billAmt5 +
## payAmt1 + payAmt2 + payAmt3 + payAmt4
# Make predictions using the full Model
# (type = "response" returns fitted probabilities rather than log-odds)
defaultData$predFull <- predict(logitModelFull, type = "response", na.action = na.exclude)
# Construct the in-sample confusion matrix
# NOTE(review): SDMTools has been archived on CRAN; confusion.matrix takes
# (observed, predicted, threshold) per its output labels below.
confMatrixModelFull <- SDMTools::confusion.matrix(defaultData$PaymentDefault,
defaultData$predFull,
threshold = 0.5
)
confMatrixModelFull
## obs
## pred 0 1
## 0 13441 3154
## 1 409 996
## attr(,"class")
## [1] "confusion.matrix"
# Calculate the accuracy for the full Model
# (accuracy = trace of the confusion matrix / total number of observations)
accuracyFull <- sum(diag(confMatrixModelFull)) / sum(confMatrixModelFull)
accuracyFull
## [1] 0.8020556
# Calculate the accuracy for 'logitModelNew'
# Make prediction
defaultData$predNew <- predict(logitModelNew, type = "response", na.action = na.exclude)
# Construct the in-sample confusion matrix
# (same 0.5 threshold as for the full model, so the two are comparable)
confMatrixModelNew <- SDMTools::confusion.matrix(defaultData$PaymentDefault,
defaultData$predNew,
threshold = 0.5
)
confMatrixModelNew
## obs
## pred 0 1
## 0 13443 3152
## 1 407 998
## attr(,"class")
## [1] "confusion.matrix"
# Calculate the accuracy...
accuracyNew <- sum(diag(confMatrixModelNew)) / sum(confMatrixModelNew)
accuracyNew
## [1] 0.8022778
# and compare it to the full model's accuracy
# (the reduced model is marginally more accurate in-sample despite having
# fewer predictors)
accuracyFull
## [1] 0.8020556
accuracyNew
## [1] 0.8022778
# Prepare data frame with threshold values and empty payoff column
# (candidate classification cutoffs 0.1 .. 0.5; payoff filled in by the loop)
payoffMatrix <- data.frame(threshold = seq(from = 0.1, to = 0.5, by = 0.1), payoff = NA)
payoffMatrix
## threshold payoff
## 1 0.1 NA
## 2 0.2 NA
## 3 0.3 NA
## 4 0.4 NA
## 5 0.5 NA
# Evaluate the business payoff at each candidate threshold.
# seq_along() replaces the 1:length() anti-pattern: it produces an empty
# iteration (rather than c(1, 0)) if the threshold vector is ever empty.
for (i in seq_along(payoffMatrix$threshold)) {
  # Calculate confusion matrix with varying threshold
  confMatrix <- SDMTools::confusion.matrix(defaultData$PaymentDefault,
                                           defaultData$predNew,
                                           threshold = payoffMatrix$threshold[i])
  # Calculate payoff and save it to the corresponding row:
  # [1, 1] = predicted 0, observed 0 (worth +250 each);
  # [1, 2] = predicted 0, observed 1 (costs 1000 each).
  payoffMatrix$payoff[i] <- confMatrix[1, 1]*250 + confMatrix[1, 2]*(-1000)
}
payoffMatrix
## threshold payoff
## 1 0.1 306750
## 2 0.2 752750
## 3 0.3 888000
## 4 0.4 641250
## 5 0.5 208750
# Split data in train and test set
# (rbinom with p = 0.66 assigns roughly two thirds of the rows to training)
set.seed(534381)
defaultData$isTrain <- rbinom(nrow(defaultData), 1, 0.66)
train <- subset(defaultData, isTrain == 1)
test <- subset(defaultData, isTrain == 0)
logitTrainNew <- glm(formulaLogit, family = binomial, data = train) # Modeling
test$predNew <- predict(logitTrainNew, type = "response", newdata = test) # Predictions
# Out-of-sample confusion matrix and accuracy
# (threshold 0.3 was the payoff-maximizing cutoff found above)
confMatrixModelNew <- SDMTools::confusion.matrix(test$PaymentDefault, test$predNew, threshold = 0.3)
sum(diag(confMatrixModelNew)) / sum(confMatrixModelNew) # Compare this value to the in-sample accuracy
## [1] 0.7797764
# Accuracy function
# Cost function for boot::cv.glm(): r holds the observed responses and pi the
# predicted probabilities for a fold. Returns the overall classification
# accuracy at the fixed 0.3 threshold chosen earlier.
costAcc <- function(r, pi = 0) {
  confusion <- SDMTools::confusion.matrix(r, pi, threshold = 0.3)
  sum(diag(confusion)) / sum(confusion)
}
# Cross validated accuracy for logitModelNew
# (6-fold cross-validation; delta[1] is the raw cross-validated estimate of
# the cost function, here accuracy)
set.seed(534381)
boot::cv.glm(defaultData, logitModelNew, cost = costAcc, K = 6)$delta[1]
## [1] 0.7862778
Chapter 3 - Modeling Time to Reorder with Survival Analysis
Survival Analysis Introduction:
Survival curve analysis by Kaplan-Meier:
Cox PH model with constant covariates:
Checking model assumptions and making predictions:
Example code includes:
# Reorder data: days since first purchase plus customer covariates.
survData <- readr::read_delim("./RInputFiles/survivalDataExercise.csv", delim=",")
## Parsed with column specification:
## cols(
## daysSinceFirstPurch = col_double(),
## shoppingCartValue = col_double(),
## gender = col_character(),
## voucher = col_double(),
## returned = col_double(),
## boughtAgain = col_double()
## )
# Keep only the time variable and the event indicator for the first fit.
dataNextOrder <- survData %>%
select(daysSinceFirstPurch, boughtAgain)
# Look at the head of the data
head(dataNextOrder)
## # A tibble: 6 x 2
## daysSinceFirstPurch boughtAgain
## <dbl> <dbl>
## 1 37 0
## 2 63 1
## 3 48 0
## 4 17 1
## 5 53 0
## 6 11 1
# Plot a histogram
ggplot(dataNextOrder) +
geom_histogram(aes(x = daysSinceFirstPurch, fill = factor(boughtAgain))) +
facet_grid( ~ boughtAgain) + # Separate plots for boughtAgain = 1 vs. 0
theme(legend.position = "none") # Don't show legend
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Create survival object
# (boughtAgain = 1 marks the event "customer reordered"; 0 is right-censored,
# printed with a trailing "+" in the str() output below)
survObj <- survival::Surv(dataNextOrder$daysSinceFirstPurch, dataNextOrder$boughtAgain)
# Look at structure
str(survObj)
## 'Surv' num [1:5122, 1:2] 37+ 63 48+ 17 53+ 11 22 16 74+ 44 ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr [1:2] "time" "status"
## - attr(*, "type")= chr "right"
# Compute and print fit
# (Kaplan-Meier estimate with no covariates: "~ 1")
fitKMSimple <- survival::survfit(survObj ~ 1)
print(fitKMSimple)
## Call: survfit(formula = survObj ~ 1)
##
## n events median 0.95LCL 0.95UCL
## 5122 3199 41 40 42
# Plot fit
plot(fitKMSimple, conf.int = FALSE, xlab = "Time since first purchase",
ylab = "Survival function", main = "Survival function"
)
# Re-select the columns, this time also keeping the voucher covariate.
dataNextOrder <- survData %>%
select(daysSinceFirstPurch, boughtAgain, voucher)
# Compute fit with categorical covariate
# (one Kaplan-Meier curve per voucher group; survObj from above is reused)
fitKMCov <- survival::survfit(survObj ~ voucher, data = dataNextOrder)
# Plot fit with covariate and add labels
plot(fitKMCov, lty = 2:3, xlab = "Time since first purchase",
ylab = "Survival function", main = "Survival function"
)
legend(90, .9, c("No", "Yes"), lty = 2:3)
# Use the full data set (all covariates) for the Cox model.
dataNextOrder <- survData
# Determine distributions of predictor variables
# (rms models require a datadist object registered via options())
dd <- rms::datadist(dataNextOrder)
options(datadist = "dd")
# Compute Cox PH Model and print results
# (x, y, surv = TRUE store the design matrix, response, and survival
# estimates that validate() and survfit() need later)
fitCPH <- rms::cph(survival::Surv(daysSinceFirstPurch, boughtAgain) ~
shoppingCartValue + voucher + returned + gender, data = dataNextOrder,
x = TRUE, y = TRUE, surv = TRUE
)
print(fitCPH)
## Cox Proportional Hazards Model
##
## rms::cph(formula = survival::Surv(daysSinceFirstPurch, boughtAgain) ~
## shoppingCartValue + voucher + returned + gender, data = dataNextOrder,
## x = TRUE, y = TRUE, surv = TRUE)
##
## Model Tests Discrimination
## Indexes
## Obs 5122 LR chi2 155.68 R2 0.030
## Events 3199 d.f. 4 Dxy 0.116
## Center -0.2808 Pr(> chi2) 0.0000 g 0.238
## Score chi2 140.57 gr 1.269
## Pr(> chi2) 0.0000
##
## Coef S.E. Wald Z Pr(>|Z|)
## shoppingCartValue -0.0021 0.0003 -7.56 <0.0001
## voucher -0.2945 0.0480 -6.14 <0.0001
## returned -0.3145 0.0495 -6.36 <0.0001
## gender=male 0.1080 0.0363 2.97 0.0029
##
# Interpret coefficients
# (exponentiated Cox coefficients are hazard ratios)
exp(fitCPH$coefficients)
## shoppingCartValue voucher returned gender=male
## 0.9978601 0.7449362 0.7301667 1.1140891
# Plot result summary
plot(summary(fitCPH), log = TRUE)
# Check proportional hazard assumption and print result
# (cox.zph tests each coefficient for time-dependence; a small p-value
# indicates a proportional-hazards violation for that covariate)
testCPH <- survival::cox.zph(fitCPH)
print(testCPH)
## chisq df p
## shoppingCartValue 0.350 1 0.554
## voucher 0.804 1 0.370
## returned 2.584 1 0.108
## gender 4.669 1 0.031
## GLOBAL 8.453 4 0.076
# Plot time-dependent beta
# (gender had p = 0.031 in the cox.zph test above, so inspect it visually)
plot(testCPH, var = "gender")
# Validate model
# (10-fold cross-validation of the model's discrimination indexes)
rms::validate(fitCPH, method = "crossvalidation", B = 10, dxy = TRUE, pr = FALSE)
## index.orig training test optimism index.corrected n
## Dxy 0.1159 0.1160 0.1145 0.0014 0.1144 10
## R2 0.0299 0.0300 0.0288 0.0013 0.0287 10
## Slope 1.0000 1.0000 0.9733 0.0267 0.9733 10
## D 0.0032 0.0033 0.0042 -0.0009 0.0041 10
## U 0.0000 0.0000 0.0002 -0.0002 0.0002 10
## Q 0.0032 0.0033 0.0040 -0.0007 0.0040 10
## g 0.2380 0.2382 0.2320 0.0062 0.2318 10
# Create data with new customer
newCustomer <- data.frame(daysSinceFirstPurch = 21, shoppingCartValue = 99.9, gender = "female",
voucher = 1, returned = 0, stringsAsFactors = FALSE
)
# Make predictions
# (predicted survival curve for this covariate profile under the Cox model)
pred <- survival::survfit(fitCPH, newdata = newCustomer)
print(pred)
## Call: survfit(formula = fitCPH, newdata = newCustomer)
##
## n events median 0.95LCL 0.95UCL
## 5122 3199 47 44 49
# Plot the predicted survival curve for the new customer.
plot(pred)
# Correct the customer's gender
newCustomer2 <- newCustomer
newCustomer2$gender <- "male"
# Redo prediction
pred2 <- survival::survfit(fitCPH, newdata = newCustomer2)
print(pred2)
## Call: survfit(formula = fitCPH, newdata = newCustomer2)
##
## n events median 0.95LCL 0.95UCL
## 5122 3199 44 42 47
Chapter 4 - Reducing Dimensionality with Principal Component Analysis
PCA for CRM Data - address multicollinearity and data volume issues in the raw CRM data:
PCA Computation:
PCA Model Specification:
Principal components in a regression analysis:
Wrap up:
Example code includes:
# Online-news data; rawData keeps the full frame (including the "shares"
# response used in the regression comparison later).
load("./RInputFiles/newsData.RData")
rawData <- newsData
# Restrict to the 21 numeric article features that enter the PCA.
newsData <- newsData[, c('n_tokens_title', 'n_tokens_content', 'n_unique_tokens', 'num_hrefs', 'num_self_hrefs', 'num_imgs', 'num_videos', 'num_keywords', 'is_weekend', 'kw_avg_min', 'kw_avg_avg', 'kw_avg_max', 'average_token_length', 'global_subjectivity', 'global_sentiment_polarity', 'global_rate_positive_words', 'global_rate_negative_words', 'avg_positive_polarity', 'avg_negative_polarity', 'title_subjectivity', 'title_sentiment_polarity')]
# Overview of data structure:
str(newsData, give.attr = FALSE)
## Classes 'tbl_df', 'tbl' and 'data.frame': 39644 obs. of 21 variables:
## $ n_tokens_title : num 12 9 9 9 13 10 8 12 11 10 ...
## $ n_tokens_content : num 219 255 211 531 1072 ...
## $ n_unique_tokens : num 0.664 0.605 0.575 0.504 0.416 ...
## $ num_hrefs : num 4 3 3 9 19 2 21 20 2 4 ...
## $ num_self_hrefs : num 2 1 1 0 19 2 20 20 0 1 ...
## $ num_imgs : num 1 1 1 1 20 0 20 20 0 1 ...
## $ num_videos : num 0 0 0 0 0 0 0 0 0 1 ...
## $ num_keywords : num 5 4 6 7 7 9 10 9 7 5 ...
## $ is_weekend : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_min : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_avg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_max : num 0 0 0 0 0 0 0 0 0 0 ...
## $ average_token_length : num 4.68 4.91 4.39 4.4 4.68 ...
## $ global_subjectivity : num 0.522 0.341 0.702 0.43 0.514 ...
## $ global_sentiment_polarity : num 0.0926 0.1489 0.3233 0.1007 0.281 ...
## $ global_rate_positive_words: num 0.0457 0.0431 0.0569 0.0414 0.0746 ...
## $ global_rate_negative_words: num 0.0137 0.01569 0.00948 0.02072 0.01213 ...
## $ avg_positive_polarity : num 0.379 0.287 0.496 0.386 0.411 ...
## $ avg_negative_polarity : num -0.35 -0.119 -0.467 -0.37 -0.22 ...
## $ title_subjectivity : num 0.5 0 0 0 0.455 ...
## $ title_sentiment_polarity : num -0.188 0 0 0 0.136 ...
# Correlation structure:
newsData %>% cor() %>% corrplot::corrplot()
# Standardize data
# (scale() centers each column and rescales it to unit variance, so the PCA
# below is effectively computed on the correlation matrix)
newsData <- newsData %>% scale() %>% as.data.frame()
# Compute PCA
pcaNews <- newsData %>% prcomp()
# Eigenvalues
# (squared component standard deviations; ^ is the conventional R
# exponentiation operator — ** is only an undocumented parser alias)
pcaNews$sdev^2
## [1] 3.31015107 2.00241491 1.82662819 1.67421238 1.30249854 1.20443028
## [7] 1.02889482 1.00052438 0.97929267 0.95905061 0.82676492 0.74951891
## [13] 0.73162009 0.66351863 0.62319656 0.57949073 0.47020594 0.41516936
## [19] 0.29926492 0.27690363 0.07624847
# Screeplot:
# (look for the "elbow" where additional components add little variance)
screeplot(pcaNews, type = "lines")
# Cumulative explained variance:
summary(pcaNews)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.8194 1.41507 1.35153 1.29391 1.14127 1.09747 1.01434
## Proportion of Variance 0.1576 0.09535 0.08698 0.07972 0.06202 0.05735 0.04899
## Cumulative Proportion 0.1576 0.25298 0.33996 0.41969 0.48171 0.53906 0.58806
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 1.00026 0.98959 0.97931 0.90927 0.86575 0.85535 0.8146
## Proportion of Variance 0.04764 0.04663 0.04567 0.03937 0.03569 0.03484 0.0316
## Cumulative Proportion 0.63570 0.68234 0.72800 0.76737 0.80307 0.83790 0.8695
## PC15 PC16 PC17 PC18 PC19 PC20 PC21
## Standard deviation 0.78943 0.76124 0.68572 0.64434 0.54705 0.52622 0.27613
## Proportion of Variance 0.02968 0.02759 0.02239 0.01977 0.01425 0.01319 0.00363
## Cumulative Proportion 0.89918 0.92677 0.94916 0.96893 0.98318 0.99637 1.00000
# Kaiser-Guttmann (number of components with eigenvalue larger than 1):
# (sdev > 1 is equivalent to eigenvalue sdev^2 > 1 because standard
# deviations are non-negative)
sum(pcaNews$sdev > 1)
## [1] 8
# Print loadings of the first six components
pcaNews$rotation[, 1:6] %>% round(2)
## PC1 PC2 PC3 PC4 PC5 PC6
## n_tokens_title -0.05 -0.10 0.01 -0.10 0.20 -0.28
## n_tokens_content 0.23 -0.17 -0.38 0.12 0.15 -0.02
## n_unique_tokens 0.00 0.00 0.00 0.01 0.01 0.06
## num_hrefs 0.26 -0.16 -0.42 -0.03 0.07 0.11
## num_self_hrefs 0.20 -0.07 -0.39 0.06 0.12 0.08
## num_imgs 0.14 -0.15 -0.43 -0.06 0.04 0.08
## num_videos 0.09 -0.20 0.04 -0.19 0.16 -0.14
## num_keywords 0.07 0.11 -0.25 0.14 -0.42 -0.30
## is_weekend 0.05 -0.01 -0.12 -0.02 -0.10 -0.16
## kw_avg_min 0.03 0.01 -0.05 -0.25 -0.65 0.07
## kw_avg_avg 0.02 -0.15 -0.06 -0.61 -0.31 0.17
## kw_avg_max -0.10 -0.21 0.10 -0.50 0.35 0.26
## average_token_length 0.39 -0.02 0.19 0.19 -0.01 0.14
## global_subjectivity 0.45 -0.01 0.23 -0.04 -0.03 0.03
## global_sentiment_polarity 0.25 0.55 -0.03 -0.19 0.11 0.13
## global_rate_positive_words 0.33 0.25 0.14 -0.08 0.04 -0.09
## global_rate_negative_words 0.15 -0.47 0.23 0.11 -0.10 -0.21
## avg_positive_polarity 0.42 0.09 0.17 -0.06 0.02 0.10
## avg_negative_polarity -0.25 0.37 -0.20 -0.04 0.08 0.06
## title_subjectivity 0.07 -0.03 0.01 -0.27 0.07 -0.61
## title_sentiment_polarity 0.07 0.24 -0.11 -0.24 0.15 -0.42
# Biplot of the observations and loadings on the first two components.
pcaNews %>% biplot(choices=1:2, cex = 0.5)
# Predict log shares with all original variables
# (log1p(shares) computes log(1 + shares) directly and with better precision
# than the hand-written log(1+shares); the response is then standardized)
logShares <- rawData %>%
select(shares) %>%
mutate(logShares=log1p(shares)) %>%
pull(logShares) %>%
scale()
newsData <- newsData %>%
cbind(logShares)
mod1 <- lm(logShares ~ ., data = newsData)
# Create dataframe with log shares and first 6 components
dataNewsComponents <- cbind(logShares = newsData[, "logShares"], pcaNews$x[, 1:6]) %>%
as.data.frame()
# Predict log shares with first six components
mod2 <- lm(logShares ~ ., data = dataNewsComponents)
# Print adjusted R squared for both models
summary(mod1)$adj.r.squared
## [1] 0.07954578
summary(mod2)$adj.r.squared
## [1] 0.05066316
Chapter 1 - Setting Up Interactive Web Maps
Introduction to leaflet - open-source JavaScript library that makes interactive, mobile-friendly maps:
Map tiles - over 100 pre-canned maps that are available as bases:
Setting the default map view:
Plotting DataCamp HQ:
Example code includes:
# Load the leaflet library
library(leaflet)
##
## Attaching package: 'leaflet'
## The following object is masked from 'package:xts':
##
## addLegend
# Create a leaflet map with default map tile using addTiles()
# (leaflet() builds the empty widget; addTiles() adds the default base layer)
leaflet() %>%
addTiles()
# Print the providers list included in the leaflet library
# (a named list of the available third-party tile providers)
providers
## $OpenStreetMap
## [1] "OpenStreetMap"
##
## $OpenStreetMap.Mapnik
## [1] "OpenStreetMap.Mapnik"
##
## $OpenStreetMap.DE
## [1] "OpenStreetMap.DE"
##
## $OpenStreetMap.CH
## [1] "OpenStreetMap.CH"
##
## $OpenStreetMap.France
## [1] "OpenStreetMap.France"
##
## $OpenStreetMap.HOT
## [1] "OpenStreetMap.HOT"
##
## $OpenStreetMap.BZH
## [1] "OpenStreetMap.BZH"
##
## $OpenSeaMap
## [1] "OpenSeaMap"
##
## $OpenPtMap
## [1] "OpenPtMap"
##
## $OpenTopoMap
## [1] "OpenTopoMap"
##
## $OpenRailwayMap
## [1] "OpenRailwayMap"
##
## $OpenFireMap
## [1] "OpenFireMap"
##
## $SafeCast
## [1] "SafeCast"
##
## $Thunderforest
## [1] "Thunderforest"
##
## $Thunderforest.OpenCycleMap
## [1] "Thunderforest.OpenCycleMap"
##
## $Thunderforest.Transport
## [1] "Thunderforest.Transport"
##
## $Thunderforest.TransportDark
## [1] "Thunderforest.TransportDark"
##
## $Thunderforest.SpinalMap
## [1] "Thunderforest.SpinalMap"
##
## $Thunderforest.Landscape
## [1] "Thunderforest.Landscape"
##
## $Thunderforest.Outdoors
## [1] "Thunderforest.Outdoors"
##
## $Thunderforest.Pioneer
## [1] "Thunderforest.Pioneer"
##
## $Thunderforest.MobileAtlas
## [1] "Thunderforest.MobileAtlas"
##
## $Thunderforest.Neighbourhood
## [1] "Thunderforest.Neighbourhood"
##
## $OpenMapSurfer
## [1] "OpenMapSurfer"
##
## $OpenMapSurfer.Roads
## [1] "OpenMapSurfer.Roads"
##
## $OpenMapSurfer.Hybrid
## [1] "OpenMapSurfer.Hybrid"
##
## $OpenMapSurfer.AdminBounds
## [1] "OpenMapSurfer.AdminBounds"
##
## $OpenMapSurfer.ContourLines
## [1] "OpenMapSurfer.ContourLines"
##
## $OpenMapSurfer.Hillshade
## [1] "OpenMapSurfer.Hillshade"
##
## $OpenMapSurfer.ElementsAtRisk
## [1] "OpenMapSurfer.ElementsAtRisk"
##
## $Hydda
## [1] "Hydda"
##
## $Hydda.Full
## [1] "Hydda.Full"
##
## $Hydda.Base
## [1] "Hydda.Base"
##
## $Hydda.RoadsAndLabels
## [1] "Hydda.RoadsAndLabels"
##
## $MapBox
## [1] "MapBox"
##
## $Stamen
## [1] "Stamen"
##
## $Stamen.Toner
## [1] "Stamen.Toner"
##
## $Stamen.TonerBackground
## [1] "Stamen.TonerBackground"
##
## $Stamen.TonerHybrid
## [1] "Stamen.TonerHybrid"
##
## $Stamen.TonerLines
## [1] "Stamen.TonerLines"
##
## $Stamen.TonerLabels
## [1] "Stamen.TonerLabels"
##
## $Stamen.TonerLite
## [1] "Stamen.TonerLite"
##
## $Stamen.Watercolor
## [1] "Stamen.Watercolor"
##
## $Stamen.Terrain
## [1] "Stamen.Terrain"
##
## $Stamen.TerrainBackground
## [1] "Stamen.TerrainBackground"
##
## $Stamen.TerrainLabels
## [1] "Stamen.TerrainLabels"
##
## $Stamen.TopOSMRelief
## [1] "Stamen.TopOSMRelief"
##
## $Stamen.TopOSMFeatures
## [1] "Stamen.TopOSMFeatures"
##
## $TomTom
## [1] "TomTom"
##
## $TomTom.Basic
## [1] "TomTom.Basic"
##
## $TomTom.Hybrid
## [1] "TomTom.Hybrid"
##
## $TomTom.Labels
## [1] "TomTom.Labels"
##
## $Esri
## [1] "Esri"
##
## $Esri.WorldStreetMap
## [1] "Esri.WorldStreetMap"
##
## $Esri.DeLorme
## [1] "Esri.DeLorme"
##
## $Esri.WorldTopoMap
## [1] "Esri.WorldTopoMap"
##
## $Esri.WorldImagery
## [1] "Esri.WorldImagery"
##
## $Esri.WorldTerrain
## [1] "Esri.WorldTerrain"
##
## $Esri.WorldShadedRelief
## [1] "Esri.WorldShadedRelief"
##
## $Esri.WorldPhysical
## [1] "Esri.WorldPhysical"
##
## $Esri.OceanBasemap
## [1] "Esri.OceanBasemap"
##
## $Esri.NatGeoWorldMap
## [1] "Esri.NatGeoWorldMap"
##
## $Esri.WorldGrayCanvas
## [1] "Esri.WorldGrayCanvas"
##
## $OpenWeatherMap
## [1] "OpenWeatherMap"
##
## $OpenWeatherMap.Clouds
## [1] "OpenWeatherMap.Clouds"
##
## $OpenWeatherMap.CloudsClassic
## [1] "OpenWeatherMap.CloudsClassic"
##
## $OpenWeatherMap.Precipitation
## [1] "OpenWeatherMap.Precipitation"
##
## $OpenWeatherMap.PrecipitationClassic
## [1] "OpenWeatherMap.PrecipitationClassic"
##
## $OpenWeatherMap.Rain
## [1] "OpenWeatherMap.Rain"
##
## $OpenWeatherMap.RainClassic
## [1] "OpenWeatherMap.RainClassic"
##
## $OpenWeatherMap.Pressure
## [1] "OpenWeatherMap.Pressure"
##
## $OpenWeatherMap.PressureContour
## [1] "OpenWeatherMap.PressureContour"
##
## $OpenWeatherMap.Wind
## [1] "OpenWeatherMap.Wind"
##
## $OpenWeatherMap.Temperature
## [1] "OpenWeatherMap.Temperature"
##
## $OpenWeatherMap.Snow
## [1] "OpenWeatherMap.Snow"
##
## $HERE
## [1] "HERE"
##
## $HERE.normalDay
## [1] "HERE.normalDay"
##
## $HERE.normalDayCustom
## [1] "HERE.normalDayCustom"
##
## $HERE.normalDayGrey
## [1] "HERE.normalDayGrey"
##
## $HERE.normalDayMobile
## [1] "HERE.normalDayMobile"
##
## $HERE.normalDayGreyMobile
## [1] "HERE.normalDayGreyMobile"
##
## $HERE.normalDayTransit
## [1] "HERE.normalDayTransit"
##
## $HERE.normalDayTransitMobile
## [1] "HERE.normalDayTransitMobile"
##
## $HERE.normalDayTraffic
## [1] "HERE.normalDayTraffic"
##
## $HERE.normalNight
## [1] "HERE.normalNight"
##
## $HERE.normalNightMobile
## [1] "HERE.normalNightMobile"
##
## $HERE.normalNightGrey
## [1] "HERE.normalNightGrey"
##
## $HERE.normalNightGreyMobile
## [1] "HERE.normalNightGreyMobile"
##
## $HERE.normalNightTransit
## [1] "HERE.normalNightTransit"
##
## $HERE.normalNightTransitMobile
## [1] "HERE.normalNightTransitMobile"
##
## $HERE.reducedDay
## [1] "HERE.reducedDay"
##
## $HERE.reducedNight
## [1] "HERE.reducedNight"
##
## $HERE.basicMap
## [1] "HERE.basicMap"
##
## $HERE.mapLabels
## [1] "HERE.mapLabels"
##
## $HERE.trafficFlow
## [1] "HERE.trafficFlow"
##
## $HERE.carnavDayGrey
## [1] "HERE.carnavDayGrey"
##
## $HERE.hybridDay
## [1] "HERE.hybridDay"
##
## $HERE.hybridDayMobile
## [1] "HERE.hybridDayMobile"
##
## $HERE.hybridDayTransit
## [1] "HERE.hybridDayTransit"
##
## $HERE.hybridDayGrey
## [1] "HERE.hybridDayGrey"
##
## $HERE.hybridDayTraffic
## [1] "HERE.hybridDayTraffic"
##
## $HERE.pedestrianDay
## [1] "HERE.pedestrianDay"
##
## $HERE.pedestrianNight
## [1] "HERE.pedestrianNight"
##
## $HERE.satelliteDay
## [1] "HERE.satelliteDay"
##
## $HERE.terrainDay
## [1] "HERE.terrainDay"
##
## $HERE.terrainDayMobile
## [1] "HERE.terrainDayMobile"
##
## $FreeMapSK
## [1] "FreeMapSK"
##
## $MtbMap
## [1] "MtbMap"
##
## $CartoDB
## [1] "CartoDB"
##
## $CartoDB.Positron
## [1] "CartoDB.Positron"
##
## $CartoDB.PositronNoLabels
## [1] "CartoDB.PositronNoLabels"
##
## $CartoDB.PositronOnlyLabels
## [1] "CartoDB.PositronOnlyLabels"
##
## $CartoDB.DarkMatter
## [1] "CartoDB.DarkMatter"
##
## $CartoDB.DarkMatterNoLabels
## [1] "CartoDB.DarkMatterNoLabels"
##
## $CartoDB.DarkMatterOnlyLabels
## [1] "CartoDB.DarkMatterOnlyLabels"
##
## $CartoDB.Voyager
## [1] "CartoDB.Voyager"
##
## $CartoDB.VoyagerNoLabels
## [1] "CartoDB.VoyagerNoLabels"
##
## $CartoDB.VoyagerOnlyLabels
## [1] "CartoDB.VoyagerOnlyLabels"
##
## $CartoDB.VoyagerLabelsUnder
## [1] "CartoDB.VoyagerLabelsUnder"
##
## $HikeBike
## [1] "HikeBike"
##
## $HikeBike.HikeBike
## [1] "HikeBike.HikeBike"
##
## $HikeBike.HillShading
## [1] "HikeBike.HillShading"
##
## $BasemapAT
## [1] "BasemapAT"
##
## $BasemapAT.basemap
## [1] "BasemapAT.basemap"
##
## $BasemapAT.grau
## [1] "BasemapAT.grau"
##
## $BasemapAT.overlay
## [1] "BasemapAT.overlay"
##
## $BasemapAT.highdpi
## [1] "BasemapAT.highdpi"
##
## $BasemapAT.orthofoto
## [1] "BasemapAT.orthofoto"
##
## $nlmaps
## [1] "nlmaps"
##
## $nlmaps.standaard
## [1] "nlmaps.standaard"
##
## $nlmaps.pastel
## [1] "nlmaps.pastel"
##
## $nlmaps.grijs
## [1] "nlmaps.grijs"
##
## $nlmaps.luchtfoto
## [1] "nlmaps.luchtfoto"
##
## $NASAGIBS
## [1] "NASAGIBS"
##
## $NASAGIBS.ModisTerraTrueColorCR
## [1] "NASAGIBS.ModisTerraTrueColorCR"
##
## $NASAGIBS.ModisTerraBands367CR
## [1] "NASAGIBS.ModisTerraBands367CR"
##
## $NASAGIBS.ViirsEarthAtNight2012
## [1] "NASAGIBS.ViirsEarthAtNight2012"
##
## $NASAGIBS.ModisTerraLSTDay
## [1] "NASAGIBS.ModisTerraLSTDay"
##
## $NASAGIBS.ModisTerraSnowCover
## [1] "NASAGIBS.ModisTerraSnowCover"
##
## $NASAGIBS.ModisTerraAOD
## [1] "NASAGIBS.ModisTerraAOD"
##
## $NASAGIBS.ModisTerraChlorophyll
## [1] "NASAGIBS.ModisTerraChlorophyll"
##
## $NLS
## [1] "NLS"
##
## $JusticeMap
## [1] "JusticeMap"
##
## $JusticeMap.income
## [1] "JusticeMap.income"
##
## $JusticeMap.americanIndian
## [1] "JusticeMap.americanIndian"
##
## $JusticeMap.asian
## [1] "JusticeMap.asian"
##
## $JusticeMap.black
## [1] "JusticeMap.black"
##
## $JusticeMap.hispanic
## [1] "JusticeMap.hispanic"
##
## $JusticeMap.multi
## [1] "JusticeMap.multi"
##
## $JusticeMap.nonWhite
## [1] "JusticeMap.nonWhite"
##
## $JusticeMap.white
## [1] "JusticeMap.white"
##
## $JusticeMap.plurality
## [1] "JusticeMap.plurality"
##
## $Wikimedia
## [1] "Wikimedia"
##
## $GeoportailFrance
## [1] "GeoportailFrance"
##
## $GeoportailFrance.parcels
## [1] "GeoportailFrance.parcels"
##
## $GeoportailFrance.ignMaps
## [1] "GeoportailFrance.ignMaps"
##
## $GeoportailFrance.maps
## [1] "GeoportailFrance.maps"
##
## $GeoportailFrance.orthos
## [1] "GeoportailFrance.orthos"
##
## $OneMapSG
## [1] "OneMapSG"
##
## $OneMapSG.Default
## [1] "OneMapSG.Default"
##
## $OneMapSG.Night
## [1] "OneMapSG.Night"
##
## $OneMapSG.Original
## [1] "OneMapSG.Original"
##
## $OneMapSG.Grey
## [1] "OneMapSG.Grey"
##
## $OneMapSG.LandLot
## [1] "OneMapSG.LandLot"
# Print only the names of the map tiles in the providers list
names(providers)
## [1] "OpenStreetMap"
## [2] "OpenStreetMap.Mapnik"
## [3] "OpenStreetMap.DE"
## [4] "OpenStreetMap.CH"
## [5] "OpenStreetMap.France"
## [6] "OpenStreetMap.HOT"
## [7] "OpenStreetMap.BZH"
## [8] "OpenSeaMap"
## [9] "OpenPtMap"
## [10] "OpenTopoMap"
## [11] "OpenRailwayMap"
## [12] "OpenFireMap"
## [13] "SafeCast"
## [14] "Thunderforest"
## [15] "Thunderforest.OpenCycleMap"
## [16] "Thunderforest.Transport"
## [17] "Thunderforest.TransportDark"
## [18] "Thunderforest.SpinalMap"
## [19] "Thunderforest.Landscape"
## [20] "Thunderforest.Outdoors"
## [21] "Thunderforest.Pioneer"
## [22] "Thunderforest.MobileAtlas"
## [23] "Thunderforest.Neighbourhood"
## [24] "OpenMapSurfer"
## [25] "OpenMapSurfer.Roads"
## [26] "OpenMapSurfer.Hybrid"
## [27] "OpenMapSurfer.AdminBounds"
## [28] "OpenMapSurfer.ContourLines"
## [29] "OpenMapSurfer.Hillshade"
## [30] "OpenMapSurfer.ElementsAtRisk"
## [31] "Hydda"
## [32] "Hydda.Full"
## [33] "Hydda.Base"
## [34] "Hydda.RoadsAndLabels"
## [35] "MapBox"
## [36] "Stamen"
## [37] "Stamen.Toner"
## [38] "Stamen.TonerBackground"
## [39] "Stamen.TonerHybrid"
## [40] "Stamen.TonerLines"
## [41] "Stamen.TonerLabels"
## [42] "Stamen.TonerLite"
## [43] "Stamen.Watercolor"
## [44] "Stamen.Terrain"
## [45] "Stamen.TerrainBackground"
## [46] "Stamen.TerrainLabels"
## [47] "Stamen.TopOSMRelief"
## [48] "Stamen.TopOSMFeatures"
## [49] "TomTom"
## [50] "TomTom.Basic"
## [51] "TomTom.Hybrid"
## [52] "TomTom.Labels"
## [53] "Esri"
## [54] "Esri.WorldStreetMap"
## [55] "Esri.DeLorme"
## [56] "Esri.WorldTopoMap"
## [57] "Esri.WorldImagery"
## [58] "Esri.WorldTerrain"
## [59] "Esri.WorldShadedRelief"
## [60] "Esri.WorldPhysical"
## [61] "Esri.OceanBasemap"
## [62] "Esri.NatGeoWorldMap"
## [63] "Esri.WorldGrayCanvas"
## [64] "OpenWeatherMap"
## [65] "OpenWeatherMap.Clouds"
## [66] "OpenWeatherMap.CloudsClassic"
## [67] "OpenWeatherMap.Precipitation"
## [68] "OpenWeatherMap.PrecipitationClassic"
## [69] "OpenWeatherMap.Rain"
## [70] "OpenWeatherMap.RainClassic"
## [71] "OpenWeatherMap.Pressure"
## [72] "OpenWeatherMap.PressureContour"
## [73] "OpenWeatherMap.Wind"
## [74] "OpenWeatherMap.Temperature"
## [75] "OpenWeatherMap.Snow"
## [76] "HERE"
## [77] "HERE.normalDay"
## [78] "HERE.normalDayCustom"
## [79] "HERE.normalDayGrey"
## [80] "HERE.normalDayMobile"
## [81] "HERE.normalDayGreyMobile"
## [82] "HERE.normalDayTransit"
## [83] "HERE.normalDayTransitMobile"
## [84] "HERE.normalDayTraffic"
## [85] "HERE.normalNight"
## [86] "HERE.normalNightMobile"
## [87] "HERE.normalNightGrey"
## [88] "HERE.normalNightGreyMobile"
## [89] "HERE.normalNightTransit"
## [90] "HERE.normalNightTransitMobile"
## [91] "HERE.reducedDay"
## [92] "HERE.reducedNight"
## [93] "HERE.basicMap"
## [94] "HERE.mapLabels"
## [95] "HERE.trafficFlow"
## [96] "HERE.carnavDayGrey"
## [97] "HERE.hybridDay"
## [98] "HERE.hybridDayMobile"
## [99] "HERE.hybridDayTransit"
## [100] "HERE.hybridDayGrey"
## [101] "HERE.hybridDayTraffic"
## [102] "HERE.pedestrianDay"
## [103] "HERE.pedestrianNight"
## [104] "HERE.satelliteDay"
## [105] "HERE.terrainDay"
## [106] "HERE.terrainDayMobile"
## [107] "FreeMapSK"
## [108] "MtbMap"
## [109] "CartoDB"
## [110] "CartoDB.Positron"
## [111] "CartoDB.PositronNoLabels"
## [112] "CartoDB.PositronOnlyLabels"
## [113] "CartoDB.DarkMatter"
## [114] "CartoDB.DarkMatterNoLabels"
## [115] "CartoDB.DarkMatterOnlyLabels"
## [116] "CartoDB.Voyager"
## [117] "CartoDB.VoyagerNoLabels"
## [118] "CartoDB.VoyagerOnlyLabels"
## [119] "CartoDB.VoyagerLabelsUnder"
## [120] "HikeBike"
## [121] "HikeBike.HikeBike"
## [122] "HikeBike.HillShading"
## [123] "BasemapAT"
## [124] "BasemapAT.basemap"
## [125] "BasemapAT.grau"
## [126] "BasemapAT.overlay"
## [127] "BasemapAT.highdpi"
## [128] "BasemapAT.orthofoto"
## [129] "nlmaps"
## [130] "nlmaps.standaard"
## [131] "nlmaps.pastel"
## [132] "nlmaps.grijs"
## [133] "nlmaps.luchtfoto"
## [134] "NASAGIBS"
## [135] "NASAGIBS.ModisTerraTrueColorCR"
## [136] "NASAGIBS.ModisTerraBands367CR"
## [137] "NASAGIBS.ViirsEarthAtNight2012"
## [138] "NASAGIBS.ModisTerraLSTDay"
## [139] "NASAGIBS.ModisTerraSnowCover"
## [140] "NASAGIBS.ModisTerraAOD"
## [141] "NASAGIBS.ModisTerraChlorophyll"
## [142] "NLS"
## [143] "JusticeMap"
## [144] "JusticeMap.income"
## [145] "JusticeMap.americanIndian"
## [146] "JusticeMap.asian"
## [147] "JusticeMap.black"
## [148] "JusticeMap.hispanic"
## [149] "JusticeMap.multi"
## [150] "JusticeMap.nonWhite"
## [151] "JusticeMap.white"
## [152] "JusticeMap.plurality"
## [153] "Wikimedia"
## [154] "GeoportailFrance"
## [155] "GeoportailFrance.parcels"
## [156] "GeoportailFrance.ignMaps"
## [157] "GeoportailFrance.maps"
## [158] "GeoportailFrance.orthos"
## [159] "OneMapSG"
## [160] "OneMapSG.Default"
## [161] "OneMapSG.Night"
## [162] "OneMapSG.Original"
## [163] "OneMapSG.Grey"
## [164] "OneMapSG.LandLot"
# Use str_detect() to determine if the name of each provider tile contains the string "CartoDB"
str_detect(names(providers), "CartoDB")
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [109] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE
## [121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [145] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [157] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# Use str_detect() to print only the provider tile names that include the string "CartoDB"
names(providers)[str_detect(names(providers), "CartoDB")]
## [1] "CartoDB" "CartoDB.Positron"
## [3] "CartoDB.PositronNoLabels" "CartoDB.PositronOnlyLabels"
## [5] "CartoDB.DarkMatter" "CartoDB.DarkMatterNoLabels"
## [7] "CartoDB.DarkMatterOnlyLabels" "CartoDB.Voyager"
## [9] "CartoDB.VoyagerNoLabels" "CartoDB.VoyagerOnlyLabels"
## [11] "CartoDB.VoyagerLabelsUnder"
# Change addTiles() to addProviderTiles() and set the provider argument to "CartoDB"
leaflet() %>%
addProviderTiles("CartoDB")
# Create a leaflet map that uses the Esri provider tile
leaflet() %>%
addProviderTiles("Esri")
# Create a leaflet map that uses the CartoDB.PositronNoLabels provider tile
leaflet() %>%
addProviderTiles("CartoDB.PositronNoLabels")
# Map with CartoDB tile centered on DataCamp's NYC office with zoom of 6
leaflet() %>%
addProviderTiles("CartoDB") %>%
setView(lng = -73.98575, lat = 40.74856, zoom = 6)
dc_hq <- tibble::tibble(hq=c("NYC", "Belgium"), lon=c(-73.98575, 4.71786), lat=c(40.7486, 50.8814))
dc_hq
## # A tibble: 2 x 3
## hq lon lat
## <chr> <dbl> <dbl>
## 1 NYC -74.0 40.7
## 2 Belgium 4.72 50.9
# Map with CartoDB.PositronNoLabels tile centered on DataCamp's Belgium office with zoom of 4
leaflet() %>%
addProviderTiles("CartoDB.PositronNoLabels") %>%
setView(lng = dc_hq$lon[2], lat = dc_hq$lat[2], zoom = 4)
leaflet(options = leafletOptions(
# Set minZoom and dragging
minZoom = 12, dragging = TRUE)) %>%
addProviderTiles("CartoDB") %>%
# Set default zoom level
setView(lng = dc_hq$lon[2], lat = dc_hq$lat[2], zoom = 14) %>%
# Set max bounds of map
setMaxBounds(lng1 = dc_hq$lon[2] + 0.05,
lat1 = dc_hq$lat[2] + .05,
lng2 = dc_hq$lon[2] - 0.05,
lat2 = dc_hq$lat[2] - .05)
# Plot DataCamp's NYC HQ
leaflet() %>%
addProviderTiles("CartoDB") %>%
addMarkers(lng = dc_hq$lon[1], lat = dc_hq$lat[1])
# Plot DataCamp's NYC HQ with zoom of 12
leaflet() %>%
addProviderTiles("CartoDB") %>%
addMarkers(lng = -73.98575, lat = 40.74856) %>%
setView(lng = -73.98575, lat = 40.74856, zoom = 12)
# Plot both DataCamp's NYC and Belgium locations
leaflet() %>%
addProviderTiles("CartoDB") %>%
addMarkers(lng = dc_hq$lon, lat = dc_hq$lat)
# Store leaflet hq map in an object called map
map <- leaflet() %>%
addProviderTiles("CartoDB") %>%
# add hq column of dc_hq as popups
addMarkers(lng = dc_hq$lon, lat = dc_hq$lat,
popup = dc_hq$hq
)
# Center the view of map on the Belgium HQ with a zoom of 5
map_zoom <- map %>%
setView(lat = 50.881363, lng = 4.717863, zoom = 5)
# Print map_zoom
map_zoom
Chapter 2 - Plotting points
Introduction to IPEDS Data:
Mapping California colleges:
Labels and pop-ups:
Color coding colleges:
Example code includes:
# Remove markers, reset bounds, and store the updated map back in the map object
# (clearMarkers() drops every marker layer; clearBounds() resets the automatic
# view bounds so the map falls back to the default world view)
map <- map %>%
clearMarkers() %>%
clearBounds()
# Print the cleared map
map
# Load the IPEDS college data (columns: name, lng, lat, state, sector_label)
ipedsRaw <- readr::read_csv("./RInputFiles/ipeds.csv")
## Parsed with column specification:
## cols(
## name = col_character(),
## lng = col_double(),
## lat = col_double(),
## state = col_character(),
## sector_label = col_character()
## )
# Remove colleges with missing sector information
# NOTE(review): tidyr::drop_na() with no column arguments drops rows with an
# NA in ANY column, not only sector_label -- confirm that is the intent
ipeds <-
ipedsRaw %>%
tidyr::drop_na()
# Count the number of four-year colleges in each state
# (count() on the grouped data returns one row per state with count column n)
ipeds %>%
group_by(state) %>%
count()
## # A tibble: 56 x 2
## # Groups: state [56]
## state n
## <chr> <int>
## 1 AK 6
## 2 AL 45
## 3 AR 26
## 4 AS 1
## 5 AZ 50
## 6 CA 272
## 7 CO 53
## 8 CT 33
## 9 DC 18
## 10 DE 7
## # ... with 46 more rows
# Create a list of US States in descending order by the number of colleges in each state
ipeds %>%
group_by(state) %>%
count() %>%
arrange(desc(n))
## # A tibble: 56 x 2
## # Groups: state [56]
## state n
## <chr> <int>
## 1 CA 272
## 2 NY 239
## 3 PA 164
## 4 FL 159
## 5 TX 154
## 6 OH 135
## 7 IL 119
## 8 MA 103
## 9 MO 87
## 10 MN 82
## # ... with 46 more rows
# Create a dataframe called `ca` with data on only colleges in California
ca <- ipeds %>%
filter(state == "CA")
# Base map: CartoDB provider tiles, no markers yet
map <- leaflet() %>%
addProviderTiles("CartoDB")
# Use `addMarkers` to plot all of the colleges in `ca` on the `map` leaflet map
map %>%
addMarkers(lng = ca$lng, lat = ca$lat)
# Coordinates of downtown Los Angeles
la_coords <- data.frame(lat = 34.05223, lon = -118.2437)
# Center the map on LA
# (passing `data = ca` lets leaflet guess the coordinate columns -- hence the
# "Assuming lng and lat..." message in the rendered output below)
map %>%
addMarkers(data = ca) %>%
setView(lat = la_coords$lat, lng = la_coords$lon, zoom = 12)
## Assuming "lng" and "lat" are longitude and latitude, respectively
# Set the zoom level to 8 and store in the m object
map_zoom <-
map %>%
addMarkers(data = ca) %>%
setView(lat = la_coords$lat, lng = la_coords$lon, zoom = 8)
## Assuming "lng" and "lat" are longitude and latitude, respectively
map_zoom
# Clear the markers from the map
map2 <- map %>% clearMarkers()
# Use addCircleMarkers() to plot each college as a circle
map2 %>%
addCircleMarkers(lng = ca$lng, lat = ca$lat)
# Change the radius of each circle to be 2 pixels and the color to red
map2 %>%
addCircleMarkers(lng = ca$lng, lat = ca$lat, radius = 2, color = "red")
# Add circle markers with popups for college names
map %>%
addCircleMarkers(data = ca, radius = 2, popup = ~name)
## Assuming "lng" and "lat" are longitude and latitude, respectively
# Change circle color to #2cb42c and store map in map_color object
map_color <- map %>%
addCircleMarkers(data = ca, radius = 2, color = "#2cb42c", popup = ~name)
## Assuming "lng" and "lat" are longitude and latitude, respectively
# Print map_color
map_color
# Clear the bounds and markers on the map object and store in map2
map2 <- map %>%
clearBounds() %>%
clearMarkers()
# Add circle markers with popups that display both the institution name and sector
map2 %>%
addCircleMarkers(data = ca, radius = 2,
popup = ~paste0(name, "<br/>", sector_label)
)
## Assuming "lng" and "lat" are longitude and latitude, respectively
# Make the institution name in each popup bold
map2 %>%
addCircleMarkers(data = ca, radius = 2,
popup = ~paste0("<b>", name, "</b>", "<br/>", sector_label)
)
## Assuming "lng" and "lat" are longitude and latitude, respectively
# Add circle markers with labels identifying the name of each college
map %>%
addCircleMarkers(data = ca, radius = 2, label = ~name)
## Assuming "lng" and "lat" are longitude and latitude, respectively
# Use paste0 to add sector information to the label inside parentheses
map %>%
addCircleMarkers(data = ca, radius = 2, label = ~paste0(name, " (", sector_label, ")"))
## Assuming "lng" and "lat" are longitude and latitude, respectively
# Make a color palette called pal for the values of `sector_label` using `colorFactor()`
# Colors should be: "red", "blue", and "#9b4a11" for "Public", "Private", and "For-Profit" colleges, respectively
pal <- colorFactor(palette = c("red", "blue", "#9b4a11"),
levels = c("Public", "Private", "For-Profit")
)
# Add circle markers that color colleges using pal() and the values of sector_label
map2 <- map %>%
addCircleMarkers(data = ca, radius = 2,
color = ~pal(sector_label),
label = ~paste0(name, " (", sector_label, ")")
)
## Assuming "lng" and "lat" are longitude and latitude, respectively
# Print map2
map2
# Add a legend that displays the colors used in pal
map2 %>%
addLegend(pal = pal, values = c("Public", "Private", "For-Profit"))
# Customize the legend
map2 %>%
addLegend(pal = pal,
values = c("Public", "Private", "For-Profit"),
# opacity of .5, title of Sector, and position of topright
opacity = 0.5, title = "Sector", position = "topright"
)
Chapter 3 - Groups, Layers, Extras
Leaflet Extras Package:
Overlay Groups - ability to control the segments that are displayed on the map:
Base Groups - can provide multiple options for toggling (only one may be selected at a time):
Pieces of Flair:
Example code includes:
library(leaflet.extras)
library(htmltools)
leaflet() %>%
addTiles() %>%
addSearchOSM() %>%
addReverseSearchOSM()
m2 <- ipeds %>%
leaflet() %>%
# use the CartoDB provider tile
addProviderTiles("CartoDB") %>%
# center on the middle of the US with zoom of 3
setView(lat = 39.8282, lng = -98.5795, zoom=3)
# Map all American colleges
m2 %>%
addCircleMarkers()
## Assuming "lng" and "lat" are longitude and latitude, respectively
# Create data frame called public with only public colleges
public <- filter(ipeds, sector_label == "Public")
# Create a leaflet map of public colleges called m3
m3 <- leaflet() %>%
addProviderTiles("CartoDB") %>%
addCircleMarkers(data = public, radius = 2, label = ~htmlEscape(name),
color = ~pal(sector_label), group = "Public"
)
## Assuming "lng" and "lat" are longitude and latitude, respectively
m3
# Create data frame called private with only private colleges
private <- filter(ipeds, sector_label == "Private")
# Add private colleges to `m3` as a new layer
m3 <- m3 %>%
addCircleMarkers(data = private, radius = 2, label = ~htmlEscape(name),
color = ~pal(sector_label), group = "Private"
) %>%
addLayersControl(overlayGroups = c("Public", "Private"))
## Assuming "lng" and "lat" are longitude and latitude, respectively
m3
# Create data frame called profit with only for-profit colleges
profit <- filter(ipeds, sector_label == "For-Profit")
# Add for-profit colleges to `m3` as a new layer
m3 <- m3 %>%
addCircleMarkers(data = profit, radius = 2, label = ~htmlEscape(name),
color = ~pal(sector_label), group = "For-Profit"
) %>%
addLayersControl(overlayGroups = c("Public", "Private", "For-Profit"))
## Assuming "lng" and "lat" are longitude and latitude, respectively
# Center the map on the middle of the US with a zoom of 4
m4 <- m3 %>%
setView(lat = 39.8282, lng = -98.5795, zoom = 4)
m4
leaflet() %>%
# Add the OSM, CartoDB and Esri tiles
addTiles(group = "OSM") %>%
addProviderTiles("CartoDB", group = "Carto") %>%
addProviderTiles("Esri", group = "Esri") %>%
# Use addLayersControl to allow users to toggle between basemaps
addLayersControl(baseGroups = c("OSM", "Carto", "Esri"))
m4 <- leaflet() %>%
addTiles(group = "OSM") %>%
addProviderTiles("CartoDB", group = "Carto") %>%
addProviderTiles("Esri", group = "Esri") %>%
addCircleMarkers(data = public, radius = 2, label = ~htmlEscape(name),
color = ~pal(sector_label), group = "Public"
) %>%
addCircleMarkers(data = private, radius = 2, label = ~htmlEscape(name),
color = ~pal(sector_label), group = "Private"
) %>%
addCircleMarkers(data = profit, radius = 2, label = ~htmlEscape(name),
color = ~pal(sector_label), group = "For-Profit"
) %>%
addLayersControl(baseGroups = c("OSM", "Carto", "Esri"),
overlayGroups = c("Public", "Private", "For-Profit")
) %>%
setView(lat = 39.8282, lng = -98.5795, zoom = 4)
## Assuming "lng" and "lat" are longitude and latitude, respectively
## Assuming "lng" and "lat" are longitude and latitude, respectively
## Assuming "lng" and "lat" are longitude and latitude, respectively
m4
ipeds %>%
leaflet() %>%
addTiles() %>%
# Sanitize any html in our labels
addCircleMarkers(radius = 2, label = ~htmlEscape(name),
# Color code colleges by sector using the `pal` color palette
color = ~pal(sector_label),
# Cluster all colleges using `clusterOptions`
clusterOptions = markerClusterOptions()
)
## Assuming "lng" and "lat" are longitude and latitude, respectively
Chapter 4 - Plotting Polygons
Spatial Data - ability to plot polygons rather than points:
Mapping Polygons - can pipe SPDF in to a series of leaflet calls:
Putting Everything Together:
Wrap up - additional resources:
Example code includes:
load("./RInputFiles/nc_zips.Rda")
load("./RInputFiles/wealthiest_zips.Rda")
nc_income <- readr::read_csv("./RInputFiles/mean_income_by_zip_nc.csv")
## Parsed with column specification:
## cols(
## zipcode = col_double(),
## returns = col_double(),
## income = col_double(),
## mean_income = col_double()
## )
str(nc_income, give.attr = FALSE)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 723 obs. of 4 variables:
## $ zipcode : num 28207 28211 27608 28480 27517 ...
## $ returns : num 4470 14060 5690 1510 12710 ...
## $ income : num 2.46e+09 3.32e+09 1.13e+09 2.41e+08 1.97e+09 ...
## $ mean_income: num 550849 235961 197725 159617 154682 ...
# Print a summary of the `shp` data
summary(shp)
## Loading required package: sp
## Object of class SpatialPolygonsDataFrame
## Coordinates:
## min max
## x -84.32187 -75.46089
## y 33.84232 36.58812
## Is projected: FALSE
## proj4string :
## [+init=epsg:4326 +proj=longlat +datum=WGS84 +no_defs +ellps=WGS84
## +towgs84=0,0,0]
## Data attributes:
## GEOID10 ALAND10
## 27006 : 1 100240769: 1
## 27007 : 1 100252722: 1
## 27009 : 1 1003885 : 1
## 27011 : 1 100620829: 1
## 27012 : 1 100707703: 1
## 27013 : 1 101001856: 1
## (Other):802 (Other) :802
# Print the class of `shp`
class(shp)
## [1] "SpatialPolygonsDataFrame"
## attr(,"package")
## [1] "sp"
# Print the slot names of `shp`
slotNames(shp)
## [1] "data" "polygons" "plotOrder" "bbox" "proj4string"
# Glimpse the data slot of shp
glimpse(shp@data)
## Observations: 808
## Variables: 2
## $ GEOID10 <fct> 27925, 28754, 28092, 27217, 28711, 28666, 28602, 27841, 278...
## $ ALAND10 <fct> 624688620, 223734670, 317180853, 318965510, 258603117, 1204...
# Print the class of the data slot of shp
class(shp@data)
## [1] "data.frame"
# Print GEOID10
shp@data$GEOID10
## [1] 27925 28754 28092 27217 28711 28666 28602 27841 27831 28785 27504 27330
## [13] 28768 28658 28716 28139 27565 28394 27982 28025 28159 28382 28312 28342
## [25] 27839 27852 28723 28077 28039 28452 27306 28375 28713 28743 28717 28150
## [37] 28447 27205 27379 28425 27827 27540 28114 28451 27892 27249 28628 27873
## [49] 28781 27916 28705 28714 28101 28102 28445 28448 28458 28719 28478 28479
## [61] 28501 28748 28752 28207 28753 28757 28209 28212 28560 28504 27983 27985
## [73] 28018 28019 28562 28906 28530 28771 28779 28782 28376 28581 28152 28169
## [85] 28170 28657 28021 28204 28533 28540 28543 28551 28262 28280 28575 28790
## [97] 28792 28667 28672 28108 28462 28681 28465 28734 28739 28694 28697 28702
## [109] 28745 28127 28420 28422 28424 28428 28435 28088 28089 28090 27562 28334
## [121] 28787 28433 27360 27534 28043 27370 28444 27531 28675 28712 28449 27053
## [133] 27944 28367 28326 28740 28659 28282 27244 27597 27017 28761 28457 28441
## [145] 27956 27889 28652 28146 28513 28777 28786 27596 27530 28369 28327 27340
## [157] 27028 27823 27879 28244 27810 27886 28306 27025 27239 27967 27824 27826
## [169] 27834 27030 28358 28365 27520 27524 27525 27526 27292 27874 27882 27883
## [181] 27885 27253 27576 27577 27582 27295 27298 27332 27910 27052 27055 27344
## [193] 27516 27850 27856 27265 27603 27605 27537 27539 27541 28601 28604 27809
## [205] 27278 27284 27371 27201 27312 28320 28325 27207 28330 28607 28611 28612
## [217] 27549 27555 27317 27320 27703 27709 28350 28643 28337 28621 27569 28645
## [229] 28651 27948 28630 27923 27929 27936 27943 28721 28512 27546 27891 28379
## [241] 27822 27909 28655 28662 27587 27589 28625 28742 28553 27941 28134 27043
## [253] 27893 28328 28135 28007 28338 27110 28472 28756 28110 28519 27861 27407
## [265] 28374 28211 28668 27214 27965 27949 27806 28340 27917 27288 27563 28669
## [277] 27229 27283 27109 27843 27047 28303 28585 28676 28689 28305 28635 28640
## [289] 27016 27863 27968 28528 27915 27981 28411 28577 27326 27954 28556 27105
## [301] 27545 27813 27974 27301 28168 28670 28801 27050 28610 28665 28125 28538
## [313] 27849 28036 28586 27801 27807 28904 27875 28557 27958 28468 27536 28213
## [325] 28341 28747 28707 27262 28006 28360 28031 27845 28166 28616 27572 27014
## [337] 27503 27011 28572 28386 27291 28432 27804 27343 28073 28467 28173 28539
## [349] 28352 27828 28515 28555 27855 27583 28310 28396 28348 28138 28642 27542
## [361] 27408 28215 27821 28105 28270 28206 28301 27876 28627 27019 28574 28647
## [373] 28806 27349 28091 28660 28726 28508 27840 28803 28511 27964 27978 28086
## [385] 27927 28774 28383 27559 28523 28332 28749 27962 27455 28056 27501 28027
## [397] 27527 27282 27837 28682 27310 28356 27233 27231 27006 28144 27857 27042
## [409] 28314 27612 28525 27281 28147 28366 28629 27523 27937 28119 28012 27048
## [421] 27880 27350 27027 27606 27938 28638 28720 28580 27103 27986 28001 28034
## [433] 28393 28032 28040 28677 28395 28391 28678 28399 28455 28098 28401 28103
## [445] 28684 28685 28409 28071 28683 28083 28708 28097 28450 28431 28453 28454
## [457] 28709 28439 28377 28715 28443 28436 28438 28751 28129 28133 28763 28109
## [469] 28120 28466 28746 28137 28480 28759 28731 28762 28405 28054 28698 28081
## [481] 28403 28052 28701 28690 28412 28704 28078 28421 28693 28544 28516 28773
## [493] 28775 28905 28174 28203 28570 28208 28210 28202 28804 28805 28791 28901
## [505] 28547 28107 28722 28729 28461 28730 28463 28552 28554 28115 28732 28112
## [517] 28214 28733 28308 28304 28571 28584 28582 28583 28273 28587 28278 28578
## [529] 28579 28323 28164 28605 28518 28520 28526 28783 28529 28167 28521 28531
## [541] 28311 28163 28537 28772 28626 27942 27928 28634 28649 28339 28357 27935
## [553] 28623 28618 28654 28624 28619 27922 28307 28226 27946 27947 28347 28349
## [565] 28227 28637 27926 27920 28646 28573 27921 28351 28269 28590 27341 28364
## [577] 27604 27976 28615 27357 28344 28613 28609 28343 27409 27376 27377 27701
## [589] 27610 27979 27405 27704 27705 27959 27960 27403 27966 27953 27970 27972
## [601] 27973 27707 27957 27401 27517 27502 27507 27508 27509 27510 27518 27505
## [613] 27020 27613 27024 27514 27519 27713 27614 27803 27616 27617 27513 27511
## [625] 27023 27046 27844 27869 27853 27051 27041 27521 27871 27872 27842 27106
## [637] 27830 27846 27013 27862 27104 27832 27847 27858 27865 27851 27825 27829
## [649] 27012 27816 27817 27557 27808 27209 27208 27820 27888 27814 27551 27556
## [661] 27045 27235 27560 27215 27054 27248 27242 27260 27243 27258 27581 27812
## [673] 27601 27592 27591 27544 27316 27313 27325 27314 27311 27896 27007 28650
## [685] 28606 27009 28735 28673 28725 28033 27870 27864 28429 28384 28663 27022
## [697] 28333 27574 28524 28527 28277 27263 28023 27573 27615 28020 28464 28128
## [709] 28009 28205 28104 27299 27884 28076 28080 28160 28532 27302 28124 27932
## [721] 27924 28037 27819 27608 28789 28079 28398 27553 27878 27018 27040 28392
## [733] 27315 28594 27950 28442 27410 27805 28371 27305 28778 28692 28072 28456
## [745] 28589 28363 27355 27358 28385 28736 27890 27522 28617 28671 28387 28390
## [757] 27212 27609 27568 28679 27881 27101 28622 28644 28631 28636 28373 28345
## [769] 27712 28117 27866 27021 27406 28741 28372 27897 28430 27980 28017 27203
## [781] 28909 27127 27607 27939 28217 28216 27252 28423 28718 27919 28510 28460
## [793] 28434 28470 28766 28546 27818 27529 28469 28016 28075 28318 27107 27356
## [805] 28315 27571 27860 28902
## 33144 Levels: 00601 00602 00603 00606 00610 00612 00616 00617 00622 ... 99929
# GEOID10 was read in as a factor (see glimpse() output above); convert via
# character first, because as.integer() applied directly to a factor would
# return the internal level codes rather than the zip code values themselves
shp@data$GEOID10 <- as.integer(as.character(shp@data$GEOID10))
str(shp@data$GEOID10)
## int [1:808] 27925 28754 28092 27217 28711 28666 28602 27841 27831 28785 ...
# Glimpse the nc_income data
glimpse(nc_income)
## Observations: 723
## Variables: 4
## $ zipcode <dbl> 28207, 28211, 27608, 28480, 27517, 27614, 28173, 28036,...
## $ returns <dbl> 4470, 14060, 5690, 1510, 12710, 15670, 21880, 7640, 910...
## $ income <dbl> 2462295000, 3317607000, 1125055000, 241022000, 19660110...
## $ mean_income <dbl> 550849.0, 235960.7, 197725.0, 159617.2, 154682.2, 15367...
# Summarise the nc_income data
summary(nc_income)
## zipcode returns income mean_income
## Min. :27006 Min. : 110 Min. :4.557e+06 Min. : 26625
## 1st Qu.:27605 1st Qu.: 1105 1st Qu.:4.615e+07 1st Qu.: 40368
## Median :28115 Median : 3050 Median :1.526e+08 Median : 46288
## Mean :28062 Mean : 5979 Mean :3.648e+08 Mean : 53338
## 3rd Qu.:28521 3rd Qu.: 9050 3rd Qu.:4.670e+08 3rd Qu.: 55917
## Max. :28909 Max. :37020 Max. :3.970e+09 Max. :550849
# Left join nc_income onto shp@data and store in shp_nc_income
# (GEOID10 was converted from factor to integer above so it can match the
# numeric `zipcode` key in nc_income)
shp_nc_income <- shp@data %>%
left_join(nc_income, by = c("GEOID10" = "zipcode"))
# Print the number of missing values of each variable in shp_nc_income
# (funs() was deprecated in dplyr 0.8; a one-sided formula is the supported
# replacement and behaves identically here)
shp_nc_income %>%
summarise_all(~ sum(is.na(.)))
## GEOID10 ALAND10 returns income mean_income
## 1 0 0 85 85 85
shp <- merge(shp, shp_nc_income, by=c("GEOID10", "ALAND10"))
# map the polygons in shp
shp %>%
leaflet() %>%
addTiles() %>%
addPolygons()
# which zips were not in the income data?
shp_na <- shp[is.na(shp$mean_income),]
# map the polygons in shp_na
shp_na %>%
leaflet() %>%
addTiles() %>%
addPolygons()
# summarise the mean income variable
summary(shp$mean_income)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 26625 40368 46288 53338 55917 550849 85
# subset shp to include only zip codes in the top quartile of mean income
high_inc <- shp[!is.na(shp$mean_income) & shp$mean_income > 55917,]
# map the boundaries of the zip codes in the top quartile of mean income
high_inc %>%
leaflet() %>%
addTiles() %>%
addPolygons()
dollar <- function(x, negative_parens = TRUE, prefix = "$", suffix = "") {
  # Format numeric amounts as currency strings, e.g. 1234 -> "$1,234".
  #
  # Args:
  #   x: numeric vector of amounts.
  #   negative_parens: if TRUE, negative amounts are wrapped in parentheses,
  #     accounting style ("($1,234)"); if FALSE they keep a minus sign
  #     ("$-1,234").
  #   prefix, suffix: strings pasted before/after each formatted amount.
  #
  # Returns: character vector the same length as x; character(0) for empty x.
  if (length(x) == 0) {
    return(character())
  }
  # Round in two steps -- to cents, then to whole units -- to reproduce the
  # original plyr-based behavior exactly, including its double-rounding edge
  # cases (e.g. 1.495 -> 1.5 -> 2). plyr::round_any(x, a) == round(x / a) * a,
  # so the plyr dependency (and the undefined `largest_with_cents` reference
  # in the old dead branch) can be dropped.
  x <- round(x / 0.01) * 0.01
  x <- round(x)
  negative <- !is.na(x) & x < 0
  # Format the magnitude; trim = TRUE prevents width-padding across the
  # vector, big.mark inserts thousands separators.
  amount <- format(abs(x), nsmall = 0L, trim = TRUE, big.mark = ",",
                   scientific = FALSE, digits = 1L)
  if (negative_parens) {
    paste0(ifelse(negative, "(", ""), prefix, amount, suffix,
           ifelse(negative, ")", ""))
  } else {
    paste0(prefix, ifelse(negative, "-", ""), amount, suffix)
  }
}
# create color palette with colorNumeric()
# (domain fixes the range of mean_income values the palette is scaled over)
nc_pal <- colorNumeric("YlGn", domain = high_inc@data$mean_income)
high_inc %>%
leaflet() %>%
addTiles() %>%
# set boundary thickness to 1 and color each polygon by its mean income
# via the nc_pal palette (note: the code colors by ~nc_pal(mean_income),
# not a fixed blue)
addPolygons(weight = 1, color = ~nc_pal(mean_income),
# add labels that display mean income
label = ~paste0("Mean Income: ", dollar(mean_income)),
# highlight polygons on hover
# NOTE(review): `highlight` relies on partial matching of addPolygons'
# `highlightOptions` argument -- spelling it out in full would be safer
highlight = highlightOptions(weight = 5, color = "white",
bringToFront = TRUE))
# Create a logged version of the nc_pal color palette
nc_pal <- colorNumeric("YlGn", domain = log(high_inc@data$mean_income))
# apply the nc_pal
high_inc %>%
leaflet() %>%
addProviderTiles("CartoDB") %>%
addPolygons(weight = 1, color = ~nc_pal(log(mean_income)), fillOpacity = 1,
label = ~paste0("Mean Income: ", dollar(mean_income)),
highlightOptions = highlightOptions(weight = 5, color = "white", bringToFront = TRUE))
# Print the slot names of `wealthy_zips`
slotNames(wealthy_zips)
## [1] "data" "polygons" "plotOrder" "bbox" "proj4string"
# Print a summary of the `mean_income` variable
summary(wealthy_zips$mean_income)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 200444 229914 279330 339859 371904 2553591
# plot zip codes with mean incomes >= $200k
wealthy_zips %>%
leaflet() %>%
addProviderTiles("CartoDB") %>%
addPolygons(weight = 1, fillOpacity = .7, color = "Green", group = "Wealthy Zipcodes",
label = ~paste0("Mean Income: ", dollar(mean_income)),
highlightOptions = highlightOptions(weight = 5, color = "white", bringToFront = TRUE))
# Add polygons using wealthy_zips
final_map <- m4 %>%
addPolygons(data = wealthy_zips, weight = 1, fillOpacity = .5, color = "Grey", group = "Wealthy Zip Codes",
label = ~paste0("Mean Income: ", dollar(mean_income)),
highlight = highlightOptions(weight = 5, color = "white", bringToFront = TRUE)) %>%
# Update layer controls including "Wealthy Zip Codes"
addLayersControl(baseGroups = c("OSM", "Carto", "Esri"),
overlayGroups = c("Public", "Private", "For-Profit", "Wealthy Zip Codes"))
# Print and explore your very last map of the course!
final_map
Chapter 1 - Introduction
Sugar content of soft drinks:
Generating a linearly separable dataset
Example code includes:
# Toy 1-D dataset: sugar content of 25 soft-drink samples, used to motivate
# the maximal margin separator.
df <- data.frame(sample=1:25,
sugar_content=c(10.9, 10.9, 10.6, 10, 8, 8.2, 8.6, 10.9, 10.7, 8, 7.7, 7.8, 8.4, 11.5, 11.2, 8.9, 8.7, 7.4, 10.9, 10, 11.4, 10.8, 8.5, 8.2, 10.6)
)
str(df)
## 'data.frame': 25 obs. of 2 variables:
## $ sample : int 1 2 3 4 5 6 7 8 9 10 ...
## $ sugar_content: num 10.9 10.9 10.6 10 8 8.2 8.6 10.9 10.7 8 ...
#print variable names
names(df)
## [1] "sample" "sugar_content"
# Build a 1-D strip plot: every point at y = 0, labelled with its value.
# (Fix: map the label through aes() instead of passing df$sugar_content from
# the global environment, so the layer stays tied to the plot's own data.)
plot_ <- ggplot(data = df, aes(x = sugar_content, y = 0)) +
geom_point() +
geom_text(aes(label = sugar_content), size = 2.5, vjust = 2, hjust = 0.5)
#display plot
plot_
# The maximal margin separator is at the midpoint of the two extreme points
# in each cluster (low-cluster max = 8.9, high-cluster min = 10).
mm_separator <- (8.9 + 10)/2
#create data frame holding the separator position
separator <- data.frame(sep = mm_separator)
# Mark the separator as a large blue point.
# (Fix: supply x and y via aes() on the layer's data instead of as bare
# vectors pulled from the enclosing environment.)
plot_ <- plot_ +
geom_point(data = separator, aes(x = sep, y = 0), color = "blue", size = 4)
#display plot
plot_
# Reproducible sample of n points uniform on the unit square.
set.seed(42)
n <- 600
df <- data.frame(x1 = runif(n), x2 = runif(n))
# Class is determined by which side of the line x2 = 1.4 * x1 a point falls:
# below the line -> -1, on/above -> 1.
df$y <- factor(ifelse(1.4 * df$x1 > df$x2, -1, 1), levels = c(-1, 1))
# Carve out a margin of half-width delta around the boundary so the two
# classes are cleanly separated.
delta <- 0.07
df1 <- subset(df, abs(1.4 * x1 - x2) > delta)
# Scatter the margin-filtered points, coloured by class, then overlay the
# true boundary (solid) and the two margin edges (dashed).
base_scatter <- ggplot(df1, aes(x = x1, y = x2, color = y)) +
  geom_point() +
  scale_color_manual(values = c("red", "blue"))
plot_margins <- base_scatter +
  geom_abline(slope = 1.4, intercept = 0) +
  geom_abline(slope = 1.4, intercept = delta, linetype = "dashed") +
  geom_abline(slope = 1.4, intercept = -delta, linetype = "dashed")
# Render the finished plot
plot_margins
Chapter 2 - Support Vector Classifiers - Linear Kernels
Linear Support Vector Machines:
Visualizing Linear SVM:
Tuning Linear SVM:
Multi-class problems:
Example code includes:
# Keep a copy of the full data, then drop points inside the margin band.
dfOld <- df
delta <- 0.07
df <- subset(df, abs(1.4 * x1 - x2) > delta)
# Flag roughly 80% of rows for training via a uniform draw per row.
df[, "train"] <- ifelse(runif(nrow(df)) < 0.8, 1, 0)
# Partition on the flag.
trainset <- df[df$train == 1, ]
testset <- df[df$train == 0, ]
# Locate the helper "train" column, then drop it from both partitions so it
# cannot leak into the model formula.
trainColNum <- grep("train", names(df))
trainset <- trainset[, -trainColNum]
testset <- testset[, -trainColNum]
library(e1071)
# Fit a linear support vector classifier; scale = FALSE keeps the predictors
# in their original units so the decision boundary can be read off directly.
svm_model<- svm(y ~ .,
data = trainset,
type = "C-classification",
kernel = "linear",
scale = FALSE)
# The fitted object is a list; these are its components (SV = support
# vectors, index = their row positions in trainset, rho = negative intercept).
names(svm_model)
## [1] "call" "type" "kernel" "cost"
## [5] "degree" "gamma" "coef0" "nu"
## [9] "epsilon" "sparse" "scaled" "x.scale"
## [13] "y.scale" "nclasses" "levels" "tot.nSV"
## [17] "nSV" "labels" "SV" "index"
## [21] "rho" "compprob" "probA" "probB"
## [25] "sigma" "coefs" "na.action" "fitted"
## [29] "decision.values" "terms"
#list values of the SV, index and rho
svm_model$SV
## x1 x2
## 11 0.4577417762 0.476919189
## 19 0.4749970816 0.486642912
## 45 0.4317512489 0.520339758
## 58 0.1712643304 0.100229354
## 61 0.6756072745 0.772399305
## 69 0.6932048204 0.838569788
## 99 0.7439746463 0.912029979
## 101 0.6262453445 0.765520479
## 103 0.2165673110 0.202548483
## 118 0.3556659538 0.298152283
## 143 0.4640695513 0.535269056
## 144 0.7793681615 0.941694443
## 147 0.1701624813 0.050030747
## 173 0.4140496817 0.380267640
## 176 0.1364903601 0.011009041
## 180 0.7690324257 0.951921815
## 194 0.1290892835 0.021196302
## 199 0.7431877197 0.824081728
## 204 0.4427962683 0.532290264
## 209 0.2524584394 0.281511990
## 226 0.8205145481 0.962842692
## 253 0.2697161783 0.288755647
## 268 0.2050496121 0.182046106
## 272 0.7853494422 0.870432480
## 278 0.4037828147 0.476424339
## 286 0.1709963905 0.164468810
## 294 0.3864540118 0.370921416
## 295 0.3324459905 0.382318948
## 325 0.5648222226 0.618285144
## 338 0.3169501573 0.333509587
## 341 0.4091320913 0.496387038
## 344 0.3597852497 0.345139100
## 393 0.6568108753 0.815567016
## 400 0.0755990995 0.007417523
## 406 0.1079870730 0.022227321
## 413 0.2401496081 0.151690785
## 427 0.4664852461 0.464965629
## 443 0.3626018071 0.369346223
## 450 0.0619409799 0.011438249
## 466 0.6399842701 0.695480783
## 479 0.1730011790 0.136427131
## 503 0.5195604505 0.627322678
## 525 0.6494539515 0.833293378
## 526 0.6903516576 0.790328991
## 535 0.4243346907 0.470753220
## 590 0.7148487861 0.902375512
## 595 0.8058112133 0.937903824
## 600 0.4587231132 0.446819442
## 15 0.4622928225 0.839631285
## 29 0.4469696281 0.721333573
## 37 0.0073341469 0.108096598
## 38 0.2076589728 0.519075874
## 59 0.2610879638 0.472588875
## 90 0.3052183695 0.548420829
## 92 0.0002388966 0.122946701
## 102 0.2171576982 0.505044580
## 104 0.3889450287 0.717138722
## 129 0.2335235255 0.439058027
## 132 0.6034740848 0.958318281
## 133 0.6315072989 0.970767964
## 158 0.0290858189 0.148069276
## 175 0.4274944656 0.725024226
## 178 0.5923042425 0.900228734
## 189 0.1333296183 0.390023998
## 196 0.0531294835 0.276241161
## 202 0.5171110556 0.899924811
## 210 0.2596899802 0.503687580
## 215 0.4513108502 0.743930877
## 229 0.0483467767 0.218475638
## 232 0.1590223818 0.402696270
## 237 0.0865806018 0.263718613
## 239 0.5545858634 0.935806216
## 249 0.4992728804 0.812805236
## 258 0.5397982858 0.932383237
## 276 0.3367135401 0.672058288
## 293 0.3152607968 0.625878707
## 309 0.3199476011 0.541676977
## 311 0.1078112544 0.374908455
## 378 0.1084886545 0.376079086
## 409 0.0842775232 0.235715229
## 419 0.4264662997 0.798480970
## 420 0.0600483362 0.298929408
## 430 0.5141573721 0.908452330
## 451 0.4309255683 0.821331850
## 477 0.5964720468 0.913432184
## 481 0.2329343846 0.409654615
## 482 0.5770482090 0.969947845
## 488 0.2485451805 0.533491509
## 520 0.5784583788 0.907620618
## 524 0.1270027745 0.348539336
## 530 0.2665205784 0.458110426
## 540 0.2131855546 0.530223881
## 558 0.2770604359 0.510976796
## 562 0.2056735931 0.433566746
## 580 0.5705413527 0.994652604
## 581 0.2458533479 0.494881822
svm_model$index
## [1] 8 11 30 39 42 48 71 73 75 85 103 104 105 124 127 131 141 145 149
## [20] 153 167 188 196 199 204 208 215 216 240 251 254 257 290 297 300 306 317 329
## [39] 335 347 353 372 390 391 398 440 443 445 9 18 23 24 40 63 65 74 76
## [58] 92 95 96 114 126 129 137 143 148 154 157 170 173 178 180 186 192 202 214
## [77] 228 229 281 303 310 311 318 336 351 355 356 362 385 389 395 402 415 419 431
## [96] 432
svm_model$rho
## [1] -0.1641859
# Accuracy on the training partition (data were made separable, so expect 1).
pred_train <- predict(svm_model, trainset)
mean(pred_train == trainset$y)
## [1] 1
# Accuracy on the held-out test partition.
pred_test <- predict(svm_model, testset)
mean(pred_test == testset$y)
## [1] 1
# Scatter plot of the training data, coloured by class.
scatter_plot <- ggplot(trainset, aes(x = x1, y = x2, color = y)) +
  geom_point() +
  scale_color_manual(values = c("red", "blue"))
# Overlay large translucent purple markers on the support vectors, whose row
# positions are recorded in svm_model$index.
layered_plot <- scatter_plot +
  geom_point(data = trainset[svm_model$index, ],
             aes(x = x1, y = x2),
             color = "purple", size = 4, alpha = 0.5)
# Render
layered_plot
# Recover the decision boundary from the weight vector w: the boundary is
# w1*x1 + w2*x2 - rho = 0, i.e. x2 = (rho - w1*x1)/w2, giving the slope and
# intercept below. The margins sit at w.x - rho = +/-1, shifting the
# intercept by +/- 1/w2.
w <- c(x1=6.55241, x2=-4.73278) # calculated manually outside of this module
slope_1 <- -w[1]/w[2]
intercept_1 <- svm_model$rho/w[2]
#build scatter plot of training dataset
scatter_plot <- ggplot(data = trainset, aes(x = x1, y = x2, color = y)) +
geom_point() + scale_color_manual(values = c("red", "blue"))
#add decision boundary
plot_decision <- scatter_plot + geom_abline(slope = slope_1, intercept = intercept_1)
#add margin boundaries (dashed lines offset by 1/w2 either side)
plot_margins <- plot_decision +
geom_abline(slope = slope_1, intercept = intercept_1 - 1/w[2], linetype = "dashed")+
geom_abline(slope = slope_1, intercept = intercept_1 + 1/w[2], linetype = "dashed")
#display plot
plot_margins
# Refit the unscaled linear SVM and use e1071's built-in plot method, which
# shades the class regions and marks the support vectors.
svm_model <- svm(y ~ .,
                 data = trainset,
                 type = "C-classification",
                 kernel = "linear",
                 scale = FALSE)
plot(x = svm_model, data = trainset)
#build svm model, cost = 1
svm_model_1 <- svm(y ~ .,
data = trainset,
type = "C-classification",
cost = 1,
kernel = "linear",
scale = FALSE)
#print model details
svm_model_1
##
## Call:
## svm(formula = y ~ ., data = trainset, type = "C-classification",
## cost = 1, kernel = "linear", scale = FALSE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 1
##
## Number of Support Vectors: 96
#build svm model, cost = 100
svm_model_100 <- svm(y ~ .,
data = trainset,
type = "C-classification",
cost = 100,
kernel = "linear",
scale = FALSE)
#print model details
svm_model_100
##
## Call:
## svm(formula = y ~ ., data = trainset, type = "C-classification",
## cost = 100, kernel = "linear", scale = FALSE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 100
##
## Number of Support Vectors: 6
# Create the base train_plot
train_plot <- ggplot(data = trainset, aes(x = x1, y = x2, color = y)) +
geom_point() + scale_color_manual(values = c("red", "blue"))
w_1 <- c(x1=6.55241, x2=-4.73278) # calculated manually outside of this module
w_100 <- c(x1=18.3097, x2=-13.09972) # calculated manually outside of this module
intercept_1 <- -0.005515526 # calculated outside of this module
intercept_100 <- 0.001852543 # calculated outside of this module
slope_1 <- -w_1[1]/w_1[2]
slope_100 <- -w_100[1]/w_100[2]
#add decision boundary and margins for cost = 1 to training data scatter plot
train_plot_with_margins <- train_plot +
geom_abline(slope = slope_1, intercept = intercept_1) +
geom_abline(slope = slope_1, intercept = intercept_1 - 1/w_1[2], linetype = "dashed")+
geom_abline(slope = slope_1, intercept = intercept_1 + 1/w_1[2], linetype = "dashed")
#display plot
train_plot_with_margins
#add decision boundary and margins for cost = 100 to training data scatter plot
train_plot_with_margins <- train_plot_with_margins +
geom_abline(slope = slope_100, intercept = intercept_100, color = "goldenrod") +
geom_abline(slope = slope_100, intercept = intercept_100 - 1/w_100[2], linetype = "dashed", color = "goldenrod")+
geom_abline(slope = slope_100, intercept = intercept_100 + 1/w_100[2], linetype = "dashed", color = "goldenrod")
#display plot
train_plot_with_margins
svm_model<- svm(y ~ ., data = trainset, type = "C-classification", kernel = "linear", scale = FALSE)
#compute training accuracy
pred_train <- predict(svm_model, trainset)
mean(pred_train == trainset$y)
## [1] 1
#compute test accuracy
pred_test <- predict(svm_model, testset)
mean(pred_test == testset$y)
## [1] 1
#plot
plot(svm_model, trainset)
# Estimate out-of-sample accuracy of a linear SVM on iris, averaged over
# many random 80/20 train/test partitions.
data(iris)
nTrials <- 100
accuracy <- numeric(nTrials)
# calculate accuracy for nTrials distinct 80/20 train/test partitions
# (seq_len() rather than 1:nTrials, which would yield c(1, 0) if nTrials == 0)
for (i in seq_len(nTrials)){
# flag ~80% of rows for training via a per-row uniform draw
iris[, "train"] <- ifelse(runif(nrow(iris))<0.8, 1, 0)
trainColNum <- grep("train", names(iris))
trainset <- iris[iris$train == 1, -trainColNum]
testset <- iris[iris$train == 0, -trainColNum]
svm_model <- svm(Species~ ., data = trainset,
type = "C-classification", kernel = "linear")
pred_test <- predict(svm_model, testset)
accuracy[i] <- mean(pred_test == testset$Species)
}
# mean accuracy and standard deviation across the partitions
mean(accuracy)
## [1] 0.9643194
sd(accuracy)
## [1] 0.03704363
Chapter 3 - Polynomial Kernels
Generating radially separable datasets:
Linear SVM on radially separable datasets:
Kernel trick - devise a mathematical transformation that makes the data linearly separable:
Tuning SVM:
Example code includes:
# Radially separable data: n points uniform on [-1,1]^2, class -1 inside a
# circle of radius 0.8 centred at the origin, class 1 outside.
n <- 400
set.seed(1)
#Generate data frame with two uniformly distributed predictors, x1 and x2
df <- data.frame(x1 = runif(n, min = -1, max = 1), x2 = runif(n, min = -1, max = 1))
#We want a circular boundary. Set boundary radius
radius <- 0.8
radius_squared <- radius^2
# Create dependent categorical variable y: -1 inside the circle, 1 outside.
# (Fix: use the documented exponent operator ^ instead of the undocumented
# ** alias.)
df$y <- factor(ifelse(df$x1^2 + df$x2^2 < radius_squared, -1, 1), levels = c(-1, 1))
# Visualise the circular class structure: red = class -1 (inside the
# circle), blue = class 1 (outside).
scatter_plot <- ggplot(df, aes(x = x1, y = x2, color = y)) +
  scale_color_manual(values = c("red", "blue")) +
  geom_point()
# Render
scatter_plot
inTrain <- sample(1:nrow(df), round(0.75*nrow(df)), replace=FALSE)
trainset <- df[sort(inTrain), ]
testset <- df[-inTrain, ]
#default cost model (cost = 1)
svm_model_1 <- svm(y ~ ., data = trainset, type = "C-classification", cost = 1, kernel = "linear")
#training accuracy
pred_train <- predict(svm_model_1, trainset)
mean(pred_train == trainset$y)
## [1] 0.64
#test accuracy
pred_test <- predict(svm_model_1, testset)
mean(pred_test == testset$y)
## [1] 0.48
#cost = 100 model
svm_model_100 <- svm(y ~ ., data = trainset, type = "C-classification", cost = 100, kernel = "linear")
#accuracy
pred_train <- predict(svm_model_100, trainset)
mean(pred_train == trainset$y)
## [1] 0.64
pred_test <- predict(svm_model_100, testset)
mean(pred_test == testset$y)
## [1] 0.48
# Preallocate the accuracy vector and fix the RNG seed for reproducibility.
accuracy <- rep(NA, 100)
set.seed(2)
# Test accuracy of a linear SVM over 100 random 80/20 partitions; accuracy
# stays near chance (see output below) because the data are radially, not
# linearly, separable.
for (i in 1:100){
df[, "train"] <- ifelse(runif(nrow(df))<0.8, 1, 0)
trainset <- df[df$train == 1, ]
testset <- df[df$train == 0, ]
trainColNum <- grep("train", names(trainset))
trainset <- trainset[, -trainColNum]
testset <- testset[, -trainColNum]
svm_model<- svm(y ~ ., data = trainset, type = "C-classification", kernel = "linear")
pred_test <- predict(svm_model, testset)
accuracy[i] <- mean(pred_test == testset$y)
}
#print average accuracy and standard deviation
mean(accuracy)
## [1] 0.5554571
sd(accuracy)
## [1] 0.04243524
# Apply the kernel trick by hand: squaring the coordinates turns the circular
# boundary x1^2 + x2^2 = 0.64 into the straight line x2sq = 0.64 - x1sq.
df1 <- data.frame(x1sq = df$x1^2, x2sq = df$x2^2, y = df$y)
# Plot the points in the transformed space.
# (Fix: guides(color = FALSE) uses the deprecated logical form; the
# supported value is the string "none".)
plot_transformed <- ggplot(data = df1, aes(x = x1sq, y = x2sq, color = y)) +
geom_point()+ guides(color = "none") +
scale_color_manual(values = c("red", "blue"))
# add the (now linear) decision boundary and visualize
plot_decision <- plot_transformed + geom_abline(slope = -1, intercept = 0.64)
plot_decision
# Still want to use the old (non-squared) data
inTrain <- sample(1:nrow(df), round(0.75*nrow(df)), replace=FALSE)
df$train <- NULL
trainset <- df[sort(inTrain), ]
testset <- df[-inTrain, ]
svm_model <- svm(y ~ ., data = trainset, type = "C-classification", kernel = "polynomial", degree = 2)
#measure training and test accuracy
pred_train <- predict(svm_model, trainset)
mean(pred_train == trainset$y)
## [1] 0.9866667
pred_test <- predict(svm_model, testset)
mean(pred_test == testset$y)
## [1] 0.98
#plot
plot(svm_model, trainset)
#tune model
tune_out <-
tune.svm(x = trainset[, -3], y = trainset[, 3],
type = "C-classification",
kernel = "polynomial", degree = 2, cost = 10^(-1:2),
gamma = c(0.1, 1, 10), coef0 = c(0.1, 1, 10))
#list optimal values
tune_out$best.parameters$cost
## [1] 0.1
tune_out$best.parameters$gamma
## [1] 10
tune_out$best.parameters$coef0
## [1] 0.1
#Build tuned model
svm_model <- svm(y ~ ., data = trainset, type = "C-classification",
kernel = "polynomial", degree = 2,
cost = tune_out$best.parameters$cost,
gamma = tune_out$best.parameters$gamma,
coef0 = tune_out$best.parameters$coef0)
#Calculate training and test accuracies
pred_train <- predict(svm_model, trainset)
mean(pred_train == trainset$y)
## [1] 0.9966667
pred_test <- predict(svm_model, testset)
mean(pred_test == testset$y)
## [1] 1
#plot model
plot(svm_model, trainset)
Chapter 4 - Radial Basis Kernel Functions
Generating complex datasets:
Motivating the RBF kernel:
The RBF kernel simulates some of the principles of kNN using exponential decay:
Example code includes:
# Two-blob classification data: n points with x1 Gaussian and x2 uniform,
# labelled -1 when a point falls inside either of two circles.
n <- 1000
set.seed(1)
# Predictors: x1 ~ Normal(-0.5, 1), x2 ~ Uniform(-1, 1)
df <- data.frame(x1 = rnorm(n, mean = -0.5, sd = 1),
                 x2 = runif(n, min = -1, max = 1))
# Two circles of radius 0.8 centred at (-0.8, 0) and (0.8, 0)
radius <- 0.8
center_1 <- c(-0.8, 0)
center_2 <- c(0.8, 0)
radius_squared <- radius^2
# Membership test against each circle, then combine: inside either -> -1
inside_1 <- (df$x1 - center_1[1])^2 + (df$x2 - center_1[2])^2 < radius_squared
inside_2 <- (df$x1 - center_2[1])^2 + (df$x2 - center_2[2])^2 < radius_squared
df$y <- factor(ifelse(inside_1 | inside_2, -1, 1), levels = c(-1, 1))
# Scatter the two classes: red = inside either circle (-1), blue = outside.
scatter_plot <- ggplot(df, aes(x = x1, y = x2, color = y)) +
  scale_color_manual(values = c("red", "blue")) +
  geom_point()
scatter_plot
# Create 75/25 split for train/test
inTrain <- sample(1:nrow(df), round(0.75*nrow(df)), replace=FALSE)
trainset <- df[sort(inTrain), ]
testset <- df[-inTrain, ]
#build model
svm_model <- svm(y ~ ., data = trainset, type = "C-classification", kernel = "linear")
#accuracy
pred_train <- predict(svm_model, trainset)
mean(pred_train == trainset$y)
## [1] 0.5853333
pred_test <- predict(svm_model, testset)
mean(pred_test == testset$y)
## [1] 0.564
#plot model against testset
plot(svm_model, testset)
#build model
svm_model <- svm(y ~ ., data = trainset, type = "C-classification", kernel = "polynomial", degree = 2)
#accuracy
pred_train <- predict(svm_model, trainset)
mean(pred_train == trainset$y)
## [1] 0.8253333
pred_test <- predict(svm_model, testset)
mean(pred_test == testset$y)
## [1] 0.788
#plot model
plot(svm_model, trainset)
#create vector to store accuracies and set random number seed
accuracy <- rep(NA, 100)
set.seed(2)
# Create a dummy frame dfDum for use in the for loop
dfDum <- df
#calculate accuracies for 100 training/test partitions
for (i in 1:100){
dfDum[, "train"] <- ifelse(runif(nrow(dfDum))<0.8, 1, 0)
trainset <- dfDum[dfDum$train == 1, ]
testset <- dfDum[dfDum$train == 0, ]
trainColNum <- grep("train", names(trainset))
trainset <- trainset[, -trainColNum]
testset <- testset[, -trainColNum]
svm_model<- svm(y ~ ., data = trainset, type = "C-classification", kernel = "polynomial", degree = 2)
pred_test <- predict(svm_model, testset)
accuracy[i] <- mean(pred_test == testset$y)
}
#print average accuracy and standard deviation
mean(accuracy)
## [1] 0.804765
sd(accuracy)
## [1] 0.02398396
#create vector to store accuracies and set random number seed
accuracy <- rep(NA, 100)
set.seed(2)
#calculate accuracies for 100 training/test partitions
for (i in 1:100){
dfDum[, "train"] <- ifelse(runif(nrow(dfDum))<0.8, 1, 0)
trainset <- dfDum[dfDum$train == 1, ]
testset <- dfDum[dfDum$train == 0, ]
trainColNum <- grep("train", names(trainset))
trainset <- trainset[, -trainColNum]
testset <- testset[, -trainColNum]
svm_model<- svm(y ~ ., data = trainset, type = "C-classification", kernel = "radial")
pred_test <- predict(svm_model, testset)
accuracy[i] <- mean(pred_test == testset$y)
}
#print average accuracy and standard deviation
mean(accuracy)
## [1] 0.9034203
sd(accuracy)
## [1] 0.01786378
# Re-create original 75/25 split for train/test
inTrain <- sample(1:nrow(df), round(0.75*nrow(df)), replace=FALSE)
trainset <- df[sort(inTrain), ]
testset <- df[-inTrain, ]
# Grid-search gamma (0.05..500) x cost (0.01..100) for the RBF kernel;
# column 3 of trainset is the response y, hence the [, -3] / [, 3] split.
tune_out <- tune.svm(x = trainset[, -3], y = trainset[, 3],
gamma = 5*10^(-2:2),
cost = c(0.01, 0.1, 1, 10, 100),
type = "C-classification", kernel = "radial")
tune_out
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## gamma cost
## 5 1
##
## - best performance: 0.04
# Refit using the cross-validated best parameters.
svm_model <- svm(y~ ., data = trainset, type = "C-classification", kernel = "radial",
cost = tune_out$best.parameters$cost,
gamma = tune_out$best.parameters$gamma)
#calculate test accuracy
pred_test <- predict(svm_model, testset)
mean(pred_test == testset$y)
## [1] 0.956
#Plot decision boundary against test data
plot(svm_model, testset)
Chapter 1 - Introduction to Experimental Design
Introduction to experimental design:
Hypothesis testing:
Example code includes:
# load the ToothGrowth dataset
data("ToothGrowth")
#perform a two-sided t-test
t.test(x = ToothGrowth$len, alternative = "two.sided", mu = 18)
##
## One Sample t-test
##
## data: ToothGrowth$len
## t = 0.82361, df = 59, p-value = 0.4135
## alternative hypothesis: true mean is not equal to 18
## 95 percent confidence interval:
## 16.83731 20.78936
## sample estimates:
## mean of x
## 18.81333
#perform a t-test
ToothGrowth_ttest <- t.test(len ~ supp, data = ToothGrowth)
#tidy the t-test model object
broom::tidy(ToothGrowth_ttest)
## # A tibble: 1 x 10
## estimate estimate1 estimate2 statistic p.value parameter conf.low conf.high
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 3.70 20.7 17.0 1.92 0.0606 55.3 -0.171 7.57
## # ... with 2 more variables: method <chr>, alternative <chr>
#group by supp, dose, then examine how many observations in ToothGrowth there are by those groups
ToothGrowth %>%
group_by(supp, dose) %>%
summarize(n=n())
## # A tibble: 6 x 3
## # Groups: supp [2]
## supp dose n
## <fct> <dbl> <int>
## 1 OJ 0.5 10
## 2 OJ 1 10
## 3 OJ 2 10
## 4 VC 0.5 10
## 5 VC 1 10
## 6 VC 2 10
# Boxplot of tooth length by dose, treating dose as a factor for display.
ggplot(ToothGrowth, aes(x=as.factor(dose), y=len)) +
geom_boxplot()
# Two-way ANOVA of length on dose and supplement.
# NOTE(review): dose enters the model as numeric (Df = 1 in the table below,
# i.e. a linear trend), unlike the boxplot above which treats it as a
# factor; use factor(dose) if separate group means are intended.
ToothGrowth_aov <- aov(len ~ dose + supp, data = ToothGrowth)
#examine the model object with summary()
summary(ToothGrowth_aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## dose 1 2224.3 2224.3 123.99 6.31e-16 ***
## supp 1 205.3 205.3 11.45 0.0013 **
## Residuals 57 1022.6 17.9
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#less than
t.test(x = ToothGrowth$len, alternative = "less", mu = 18)
##
## One Sample t-test
##
## data: ToothGrowth$len
## t = 0.82361, df = 59, p-value = 0.7933
## alternative hypothesis: true mean is less than 18
## 95 percent confidence interval:
## -Inf 20.46358
## sample estimates:
## mean of x
## 18.81333
#greater than
t.test(x = ToothGrowth$len, alternative = "greater", mu = 18)
##
## One Sample t-test
##
## data: ToothGrowth$len
## t = 0.82361, df = 59, p-value = 0.2067
## alternative hypothesis: true mean is greater than 18
## 95 percent confidence interval:
## 17.16309 Inf
## sample estimates:
## mean of x
## 18.81333
#calculate power
pwr::pwr.t.test(n = 100, d = 0.35, sig.level = 0.10, type = "two.sample",
alternative = "two.sided", power = NULL
)
##
## Two-sample t test power calculation
##
## n = 100
## d = 0.35
## sig.level = 0.1
## power = 0.7943532
## alternative = two.sided
##
## NOTE: n is number in *each* group
#calculate sample size
pwr::pwr.t.test(n = NULL, d = 0.25, sig.level = 0.05,
type = "one.sample", alternative = "greater", power = 0.8
)
##
## One-sample t test power calculation
##
## n = 100.2877
## d = 0.25
## sig.level = 0.05
## power = 0.8
## alternative = greater
Chapter 2 - Basic Experiments
Single and Multiple Factor Experiments:
Model Validation:
A/B Testing:
Example code includes:
lendingclub <- readr::read_csv("./RInputFiles/lendclub.csv")
## Parsed with column specification:
## cols(
## member_id = col_double(),
## loan_amnt = col_double(),
## funded_amnt = col_double(),
## term = col_character(),
## int_rate = col_double(),
## emp_length = col_character(),
## home_ownership = col_character(),
## annual_inc = col_double(),
## verification_status = col_character(),
## loan_status = col_character(),
## purpose = col_character(),
## grade = col_character()
## )
#examine the variables with glimpse()
glimpse(lendingclub)
## Observations: 1,500
## Variables: 12
## $ member_id <dbl> 55096114, 1555332, 1009151, 69524202, 72128084,...
## $ loan_amnt <dbl> 11000, 10000, 13000, 5000, 18000, 14000, 8000, ...
## $ funded_amnt <dbl> 11000, 10000, 13000, 5000, 18000, 14000, 8000, ...
## $ term <chr> "36 months", "36 months", "60 months", "36 mont...
## $ int_rate <dbl> 12.69, 6.62, 10.99, 12.05, 5.32, 16.99, 13.11, ...
## $ emp_length <chr> "10+ years", "10+ years", "3 years", "10+ years...
## $ home_ownership <chr> "RENT", "MORTGAGE", "MORTGAGE", "MORTGAGE", "MO...
## $ annual_inc <dbl> 51000, 40000, 78204, 51000, 96000, 47000, 40000...
## $ verification_status <chr> "Not Verified", "Verified", "Not Verified", "No...
## $ loan_status <chr> "Current", "Fully Paid", "Fully Paid", "Current...
## $ purpose <chr> "debt_consolidation", "debt_consolidation", "ho...
## $ grade <chr> "C", "A", "B", "C", "A", "D", "C", "A", "D", "B...
#find median loan_amt, mean int_rate, and mean annual_inc with summarise()
lendingclub %>% summarise(median(loan_amnt), mean(int_rate), mean(annual_inc))
## # A tibble: 1 x 3
## `median(loan_amnt)` `mean(int_rate)` `mean(annual_inc)`
## <dbl> <dbl> <dbl>
## 1 13000 13.3 75736.
# use ggplot2 to build a bar chart of purpose
ggplot(data=lendingclub, aes(x = purpose)) + geom_bar()
# Collapse the raw loan purposes into broader categories. Values not listed
# here (e.g. "other", "renewable_energy") pass through recode() unchanged.
lendingclub$purpose_recode <- recode(
  lendingclub$purpose,
  "credit_card"        = "debt_related",
  "debt_consolidation" = "debt_related",
  "medical"            = "debt_related",
  "car"                = "big_purchase",
  "major_purchase"     = "big_purchase",
  "vacation"           = "big_purchase",
  "moving"             = "life_change",
  "small_business"     = "life_change",
  "wedding"            = "life_change",
  "house"              = "home_related",
  "home_improvement"   = "home_related"
)
#build a linear regression model, stored as purpose_recode_model
purpose_recode_model <- lm(funded_amnt ~ purpose_recode, data = lendingclub)
#look at results of purpose_recode_model
summary(purpose_recode_model)
##
## Call:
## lm(formula = funded_amnt ~ purpose_recode, data = lendingclub)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14472 -6251 -1322 4678 25761
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9888.1 1248.9 7.917 4.69e-15 ***
## purpose_recodedebt_related 5433.5 1270.5 4.277 2.02e-05 ***
## purpose_recodehome_related 4845.0 1501.0 3.228 0.00127 **
## purpose_recodelife_change 4095.3 2197.2 1.864 0.06254 .
## purpose_recodeother -649.3 1598.3 -0.406 0.68461
## purpose_recoderenewable_energy -1796.4 4943.3 -0.363 0.71636
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8284 on 1494 degrees of freedom
## Multiple R-squared: 0.03473, Adjusted R-squared: 0.0315
## F-statistic: 10.75 on 5 and 1494 DF, p-value: 3.598e-10
#get anova results and save as purpose_recode_anova
purpose_recode_anova <- anova(purpose_recode_model)
# look at the class of purpose_recode_anova
class(purpose_recode_anova)
## [1] "anova" "data.frame"
#Use aov() to build purpose_recode_aov
purpose_recode_aov <- aov(funded_amnt ~ purpose_recode, data = lendingclub)
#Conduct Tukey's HSD test to create tukey_output
tukey_output <- TukeyHSD(purpose_recode_aov)
#tidy tukey_output to make sense of the results
broom::tidy(tukey_output)
## # A tibble: 15 x 6
## term comparison estimate conf.low conf.high adj.p.value
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 purpose_rec~ debt_related-big_purcha~ 5434. 1808. 9059. 2.91e-4
## 2 purpose_rec~ home_related-big_purcha~ 4845. 562. 9128. 1.61e-2
## 3 purpose_rec~ life_change-big_purchase 4095. -2174. 10365. 4.25e-1
## 4 purpose_rec~ other-big_purchase -649. -5210. 3911. 9.99e-1
## 5 purpose_rec~ renewable_energy-big_pu~ -1796. -15902. 12309. 9.99e-1
## 6 purpose_rec~ home_related-debt_relat~ -589. -3056. 1879. 9.84e-1
## 7 purpose_rec~ life_change-debt_related -1338. -6539. 3863. 9.78e-1
## 8 purpose_rec~ other-debt_related -6083. -9005. -3160. 5.32e-8
## 9 purpose_rec~ renewable_energy-debt_r~ -7230. -20894. 6434. 6.58e-1
## 10 purpose_rec~ life_change-home_related -750. -6429. 4929. 9.99e-1
## 11 purpose_rec~ other-home_related -5494. -9201. -1787. 3.58e-4
## 12 purpose_rec~ renewable_energy-home_r~ -6641. -20494. 7212. 7.46e-1
## 13 purpose_rec~ other-life_change -4745. -10636. 1147. 1.95e-1
## 14 purpose_rec~ renewable_energy-life_c~ -5892. -20482. 8698. 8.59e-1
## 15 purpose_rec~ renewable_energy-other -1147. -15088. 12794. 1.00e+0
#Use aov() to build purpose_emp_aov
purpose_emp_aov <- aov(funded_amnt ~ purpose_recode + emp_length, data=lendingclub)
#print purpose_emp_aov to the console
purpose_emp_aov
## Call:
## aov(formula = funded_amnt ~ purpose_recode + emp_length, data = lendingclub)
##
## Terms:
## purpose_recode emp_length Residuals
## Sum of Squares 3688783338 2044273211 100488872355
## Deg. of Freedom 5 11 1483
##
## Residual standard error: 8231.679
## Estimated effects may be unbalanced
#call summary() to see the p-values
summary(purpose_emp_aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## purpose_recode 5 3.689e+09 737756668 10.888 2.63e-10 ***
## emp_length 11 2.044e+09 185843019 2.743 0.00161 **
## Residuals 1483 1.005e+11 67760534
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#examine the summary of int_rate
summary(lendingclub$int_rate)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.32 9.99 12.99 13.31 16.29 26.77
#examine int_rate by grade
lendingclub %>%
group_by(grade) %>%
summarise(mean = mean(int_rate), var = var(int_rate), median = median(int_rate))
## # A tibble: 7 x 4
## grade mean var median
## <chr> <dbl> <dbl> <dbl>
## 1 A 7.27 0.961 7.26
## 2 B 10.9 2.08 11.0
## 3 C 14.0 1.42 14.0
## 4 D 17.4 1.62 17.6
## 5 E 20.1 2.71 20.0
## 6 F 23.6 2.87 23.5
## 7 G 26.1 0.198 25.9
#make a boxplot of int_rate by grade
ggplot(lendingclub, aes(x = grade, y = int_rate)) + geom_boxplot()
# One-way ANOVA of interest rate on grade; grade is strongly significant.
grade_aov <- aov(int_rate ~ grade, data = lendingclub)
summary(grade_aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## grade 6 27013 4502 2637 <2e-16 ***
## Residuals 1493 2549 2
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Arrange base-graphics diagnostic plots (residuals, QQ, etc.) in a 2x2 grid
par(mfrow=c(2, 2))
#plot grade_aov
plot(grade_aov)
# restore the default single-panel layout
par(mfrow=c(1, 1))
# Bartlett's test for homogeneity of variance across grades; the tiny
# p-value below indicates the equal-variance assumption of the ANOVA is
# questionable for these data.
bartlett.test(int_rate ~ grade, data=lendingclub)
##
## Bartlett test of homogeneity of variances
##
## data: int_rate by grade
## Bartlett's K-squared = 78.549, df = 6, p-value = 7.121e-15
#use the correct function from pwr to find the sample size
pwr::pwr.t.test(n=NULL, d=0.2, sig.level=0.05,
type="two.sample", alternative="two.sided", power=0.8
)
##
## Two-sample t test power calculation
##
## n = 393.4057
## d = 0.2
## sig.level = 0.05
## power = 0.8
## alternative = two.sided
##
## NOTE: n is number in *each* group
# Synthetic member IDs assigned to experimental group "A".
# Built in two steps only to keep individual source lines manageable;
# the second call appends to the vector created by the first.
lc_A <- c(11976148, 1203719, 54998739, 5801830, 31587242, 7711391, 54494666, 57663583, 8967787, 21760921, 44765721, 8596988, 5794746, 59501253, 10578432, 36058744, 11727607, 357888, 51936863, 1178593, 57315811, 5705168, 46024211, 12947039, 57345207, 55299831, 28763037, 49763149, 20077511, 60216198, 12295190, 1570287, 61408414, 59121340, 32349527, 5773180, 26899704, 55412161, 2217935, 16462713, 9196065, 27802028, 40949245, 56007625, 56935379, 62187473, 20178048, 604912, 58533358, 652594, 44066849, 38942161, 6414816, 65617953, 51816492, 43489983, 6794967, 42345315, 59532019, 13107597, 63249029, 7371829, 12335467, 8560739, 7337238, 887484, 23493355, 41031080, 60537197, 12816159, 38446687, 51026618, 6374688, 18685270, 296645, 44439325, 4915968, 63449566, 25256236, 63407874, 36753301, 20728660, 7937228, 13058684, 636359, 50527238, 40450502, 1018943, 12438198, 3065732, 1510626, 5764344, 37840363, 27460227, 39751366, 5028066, 43956700, 56109033, 1412622, 44289534, 41770436, 49956562, 44409121, 47168726, 60953428, 52189251, 64281487, 51928150, 1002880, 4537354, 12605849, 477843, 6808167, 38629237, 33311208, 36109419, 58593881, 40362979, 440300, 9848361, 30656060, 15691500, 4375269, 15360849, 7077904, 66076532, 33350264, 4175651, 44006939, 21130605, 54098234, 53192890, 7371114, 12967808, 58061230, 34803392, 5544911, 28843825, 63244663, 38504887, 68565204, 1211255, 63427670, 56472411, 10548622, 43957279, 59313014, 5768723, 66210490, 25507112, 55472659, 61339767, 65684813, 45544639, 43710238, 46833245, 13028661, 13167268, 3064642, 62072249, 27631726, 65825964, 15540990, 64320858, 8605358, 17795606, 9894584, 543619, 2380700, 20959552, 57743104, 63917130, 38480348, 61393540, 19916851)
lc_A <- c(lc_A, 12528162, 7264617, 61480809, 36411752, 20139228, 21290880, 390228, 45584424, 17755019, 23413261, 15490914, 1254285, 875004, 24274579, 51006600, 11458143, 5125832, 37802077, 57327243, 41059894, 64978360, 58683523, 4290736, 40919379, 65029207, 7096004, 42285591, 7388784, 65914238, 46833088, 21221678, 62855006, 10557733, 44915714, 23083224, 67289213, 9746670, 349608, 66610322, 1595886, 3635144, 38419356, 9715410, 9726377, 621152, 23213635, 18685424, 65782663, 57304429, 20770003, 8865120, 58664359, 1454540, 42404539, 60952405, 61339308, 7367648, 11215938, 41207320, 23553299, 1681376, 7617266, 30485630, 10604792, 46044414, 63094909, 59189668, 10106916, 52058386, 17763104, 6396213, 8981232, 48070364, 10615808, 11956507, 38444903, 60216940, 58310439, 10099562, 7504691, 17533228, 62236540, 38626163, 55657128, 7728107, 42415348, 42454693, 4777573, 23834164, 25157042, 1339435, 50587486, 55998961, 32950014, 28422748, 492346, 50607472, 11335041, 4254623, 65058537, 5375256, 5646680, 44430975, 4054992, 55253292, 68375791, 16822421, 64978226, 59859214, 65424555, 10112206, 6908772, 67879649, 4794842, 31227479, 17423361, 64049774, 58624386, 14829134, 50233873, 44389635, 29684724, 452267, 43044890, 55942742, 19516366, 34443897, 57135665, 34392172, 17352839, 12896521, 40451807, 43255228, 40372428, 8568706, 68364520, 3486848, 40991148, 19196658, 8658538, 65885614, 38352455, 65674149, 1029473, 39290483, 47420355, 65364529, 32318884, 13115811, 48484348, 65975356, 56129109, 3378980, 31026386, 55231010, 41113253, 1480114, 51406116, 2445051, 8627441, 60942818, 55453270, 58573102, 25767158, 9655554, 49783137, 42273770, 32038806, 681948, 65059359, 48546050, 20169281, 68546780, 7065575, 46387142, 66180493, 58430918, 1390497, 41950574, 39888056, 11774847, 55308824, 51969105, 7936525, 5960208, 7700566, 14529825, 14688918, 43024566, 21110140, 55797803, 31236439, 6817136, 1467168, 36028128, 60781310, 66595886, 57548184, 3194733, 8589175, 1546517, 17654773, 40572454, 63284984, 
5780985, 39660177, 64050493, 55081623, 51346675, 1235123, 65633931, 66390924, 17413278, 57950994, 55911330, 11814853, 31357211, 56038385, 40038565, 64400706, 35034758, 60296238, 6527713, 5685238, 1062701, 63406447, 64008930, 63476297, 5114652, 20060374, 10085133, 61328568, 9435001, 56057656, 49934674, 39661404, 19616499, 34342717, 46653815, 45614269, 59290211, 31296803, 50605437, 46928301, 58562582, 63879452, 65733359, 51086476, 40601201, 9845217, 29213549, 41227222, 7337659, 46517072, 38610653, 9694813, 21350102, 46716202, 50535150, 39729407, 22263578, 25987787, 64913590, 19636684, 59311687, 4295372, 571012, 20588847, 63424767, 1099384, 3810242, 5604591, 39760687, 43739869, 56019939, 51526987, 45494853, 4302122, 21009984, 66210827, 67255219, 46613149, 63345017, 43570211, 62002161, 2214708, 4234697, 51055338, 19647002, 28593783, 6804647, 40542044, 42263319, 4784593, 19636686, 44015285, 55697847, 5814660, 15409525, 2307393, 54404433, 15490230, 62245810, 64969544, 48120716, 41040511, 51176224, 6376426, 60386775, 826517, 27601385, 8185587, 28564285, 68613325, 58623041, 60941473, 1635691, 7729270, 46417835, 57285778, 55960993, 66510262, 60285691, 61902329, 68565071)
# Synthetic member IDs assigned to experimental group "B".
# Same two-step construction as lc_A above: the second call appends
# to the vector created by the first.
lc_B <- c(62012715, 49974687, 27570947, 63417796, 61449107, 12906517, 57074291, 21021086, 404854, 15139172, 46774978, 50486061, 4305577, 65783354, 48544529, 31667129, 36980133, 19117791, 3845908, 846821, 40381968, 64018601, 57184860, 49963980, 44142706, 6327771, 20811335, 67336862, 3628833, 31247310, 4764984, 1619549, 56492219, 67959628, 61672211, 1472227, 55268407, 13497237, 57538143, 43096178, 35723158, 226780, 2307012, 1210773, 50273799, 28903599, 50839792, 44916418, 9714937, 51876659, 3919804, 12968154, 54978278, 6938022, 53854432, 63350177, 39692948, 67216234, 22253060, 59099446, 46135199, 11717805, 48596572, 8475061, 61462130, 21480483, 2014943, 41430440, 43196143, 243173, 61543762, 66562164, 67878273, 41100627, 11915326, 28753020, 12617369, 59090559, 55583726, 31256585, 544537, 61430245, 1681767, 7670078, 38506546, 36500594, 31367711, 46694948, 2080069, 38457330, 54524836, 27651989, 63358477, 62002922, 8995111, 45694307, 61470409, 17933815, 27370082, 66612753, 1536521, 54948920, 57548472, 876991, 40127147, 57365210, 1904740, 3195692, 743529, 67408356, 8766184, 23643466, 51336378, 13397002, 3700020, 49935259, 38455198, 63506356, 11386690, 32479126, 6300017, 67427011, 63344398, 51366616, 727247, 59291548, 21551336, 8776003, 16111335, 1051513, 61973285, 60764833, 59190150, 25406927, 10138072, 61361677, 32279884, 63337618, 49933340, 30565592, 3217416, 61883095, 63436296, 58290318, 29884855, 50353289, 14699170, 67625637, 6815821, 2286867, 6274586, 17853756, 55948157, 6995898, 44126015, 66643915, 41338910, 8626219, 67858810, 38597465, 45884338, 565018, 46436141, 15259622, 6594706, 39479497, 5535388, 5855546, 48734782, 2896555, 67296211, 713979, 33110251, 8987918, 1224687, 5637315, 484473, 9814600, 29694710, 60902260, 25897153, 40705483, 1439301, 3055155, 26319992, 6245002, 66441896, 46427698, 36330836, 8915199, 46205024, 62459417, 3497439, 54888931, 30475522, 38998249, 12636103, 60536957)
lc_B <- c(lc_B, 27521279, 2365984, 361549, 43430210, 35843833, 9768308, 12705933, 59179388, 60830121, 67929084, 36138408, 854552, 8865548, 13096420, 23836169, 61502149, 1621627, 11426617, 48274995, 41123011, 7296181, 29635336, 30565882, 8145149, 46116481, 21119590, 43894290, 65866235, 44143687, 873468, 12419378, 26378681, 55140334, 56964922, 61682200, 14338072, 65047247, 57267246, 59581503, 41093708, 48524124, 513842, 1685090, 42723216, 60647576, 55341080, 9735578, 41110083, 30255415, 56010965, 63214550, 67828966, 671468, 38540004, 65107371, 18645038, 26017706, 660734, 573283, 9454644, 64017354, 617449, 7645594, 43286428, 55941273, 8636865, 31226902, 46194753, 6160505, 1412225, 65741544, 24084859, 58532795, 41880754, 45515321, 60585561, 65272380, 7937327, 1489732, 17553239, 7638498, 1473206, 38162164, 3355990, 15610681, 57025137, 6254978, 38162571, 52768311, 5938741, 58101279, 18895673, 30175739, 38222417, 55909312, 65663878, 6607837, 24725076, 61722475, 11895058, 28182084, 185962, 55259655, 16241080, 66602227, 5781939, 60801476, 6996130, 12346893, 65672013, 19076244, 1475379, 9056893, 59492895, 56864322, 60942704, 44015940, 62225220, 39739191, 66435524, 44199929, 59471139, 38547168, 6205030, 38615829, 6698930, 66514563, 1623685, 60545969, 46703319, 39739315, 12636426, 65364691, 16403147, 9204637, 19306532, 66270322, 65653692, 22313524, 59082682, 19796545, 10766253, 50436003, 49363132, 27600713, 44865530, 57763719, 47857115, 48535477, 65986020, 58603818, 42934257, 1167844, 66390187, 58281312, 63888770, 48596526, 67385135, 24775459, 55090096, 12347068, 37317537, 64007908, 1683908, 11976597, 41019342, 6855113, 7964638, 65701227, 44037648, 23133074, 9787718, 61389384, 38418035, 33130454, 13038119, 14639242, 38505864, 65725266, 62904623, 68513661, 36039498, 6538734, 51857455, 59139740, 64341225, 21430833, 55455899, 17795459, 65128493, 46428798, 43216120, 59199242, 50364311, 41079485, 27711293, 63218354, 65492649, 50819365, 40737432, 377507, 65736437, 61488876, 
44886450, 31467727, 46651816, 11914779, 65352381, 24726593, 52989922, 43105128, 34322310, 8669148, 12795739, 38485516, 39559934, 4280915, 63437401, 7103037, 44946049, 15400322, 28583975, 59592185, 877645, 56019484, 3372858, 60556772, 19846532, 11658194, 6894823, 61414862, 52708301, 48806212, 12204849, 60863986, 3919883, 37661631, 47210580, 14689912, 23393084, 60961679, 6170889, 55191727, 14690280, 42415518, 65855022, 62156039, 38536464, 44603544, 63527328, 48182146, 25867085, 61952845, 4744682, 20110370, 65854766, 57722242, 11438361, 34111919, 53262232, 12247443, 64210396, 37630339, 41237564, 46722148, 65791211, 16882760, 7719304, 37622016, 3220774, 51906280, 12446784, 50064210, 57733299, 63437152, 38445791, 3730324, 56052115, 57354312, 58010576, 626701, 7224706, 64079786, 62167132, 8396526, 7625377, 12707224, 35084508, 56022111, 52027979, 43215589, 50425264, 59253209, 28312549, 67376619, 30795837, 43869662, 20849433, 55351366, 39549686, 22972745, 1025579)
# The specific member IDs in lc_A and lc_B are not in dataset lendingclub
# Tag every loan with its experimental group: "A" if the member ID is in
# lc_A, "B" if it is in lc_B, and "C" otherwise. (Since %in% never yields
# NA, the fall-through case is exactly the "C" group.)
lendingclub_ab <- lendingclub %>%
  mutate(Group = case_when(
    member_id %in% lc_A ~ "A",
    member_id %in% lc_B ~ "B",
    TRUE ~ "C"
  ))
# ggplot(lendingclub_ab, aes(x=Group, y=loan_amnt)) + geom_boxplot()
# conduct a two-sided t-test
# t.test(loan_amnt ~ Group, data=lendingclub_ab)
# build lendingclub_multi
# lendingclub_multi <- lm(loan_amnt ~ Group + grade + verification_status, data=lendingclub_ab)
# examine lendingclub_multi results
# broom::tidy(lendingclub_multi)
Chapter 3 - Randomized Complete (and Balanced Incomplete) Block Designs
Intro to NHANES Dataset and Sampling:
Randomized Complete Block Designs (RCBD):
Balanced Incomplete Block Designs (BIBD):
Example code includes:
# Load the NHANES demographics extract; readr guesses every column as double.
nhanes_demo <- readr::read_csv("./RInputFiles/nhanes_demo.csv")
## Parsed with column specification:
## cols(
## .default = col_double()
## )
## See spec(...) for full column specifications.
# Load the NHANES medical-conditions extract. Note the 34 parsing failures
# below: several mcq240* columns are guessed as logical from leading rows
# but contain numeric codes further down, so those cells become NA.
nhanes_medical <- readr::read_csv("./RInputFiles/nhanes_medicalconditions.csv")
## Parsed with column specification:
## cols(
## .default = col_double(),
## mcq230d = col_logical(),
## mcq240b = col_logical(),
## mcq240c = col_logical(),
## mcq240d = col_logical(),
## mcq240dk = col_logical(),
## mcq240h = col_logical(),
## mcq240i = col_logical(),
## mcq240k = col_logical(),
## mcq240q = col_logical(),
## mcq240r = col_logical(),
## mcq240s = col_logical(),
## mcq240v = col_logical(),
## mcq240y = col_logical()
## )
## See spec(...) for full column specifications.
## Warning: 34 parsing failures.
## row col expected actual file
## 1510 mcq240s 1/0/T/F/TRUE/FALSE 41 './RInputFiles/nhanes_medicalconditions.csv'
## 1900 mcq240s 1/0/T/F/TRUE/FALSE 53 './RInputFiles/nhanes_medicalconditions.csv'
## 1982 mcq240q 1/0/T/F/TRUE/FALSE 80 './RInputFiles/nhanes_medicalconditions.csv'
## 3132 mcq240q 1/0/T/F/TRUE/FALSE 56 './RInputFiles/nhanes_medicalconditions.csv'
## 3452 mcq240c 1/0/T/F/TRUE/FALSE 69 './RInputFiles/nhanes_medicalconditions.csv'
## .... ....... .................. ...... ............................................
## See problems(...) for more details.
# Load the NHANES body-measures extract.
nhanes_bodymeasures <- readr::read_csv("./RInputFiles/nhanes_bodymeasures.csv")
## Parsed with column specification:
## cols(
## .default = col_double(),
## bmihead = col_logical()
## )
## See spec(...) for full column specifications.
# Load the course's pre-built "final" dataset.
# NOTE(review): dummy_nhanes_final is not referenced again in this excerpt;
# presumably it is kept only for comparison with the nhanes_final built
# below from the three component files — confirm before removing.
dummy_nhanes_final <- readr::read_csv("./RInputFiles/nhanes_final.csv")
## Parsed with column specification:
## cols(
## .default = col_double(),
## mcq230d = col_logical(),
## mcq240b = col_logical(),
## mcq240c = col_logical(),
## mcq240d = col_logical(),
## mcq240dk = col_logical(),
## mcq240h = col_logical(),
## mcq240i = col_logical(),
## mcq240k = col_logical(),
## mcq240q = col_logical(),
## mcq240r = col_logical(),
## mcq240s = col_logical(),
## mcq240v = col_logical(),
## mcq240y = col_logical(),
## bmxhead = col_logical(),
## bmihead = col_logical()
## )
## See spec(...) for full column specifications.
## Warning: 33 parsing failures.
## row col expected actual file
## 1443 mcq240s 1/0/T/F/TRUE/FALSE 41 './RInputFiles/nhanes_final.csv'
## 1819 mcq240s 1/0/T/F/TRUE/FALSE 53 './RInputFiles/nhanes_final.csv'
## 1897 mcq240q 1/0/T/F/TRUE/FALSE 80 './RInputFiles/nhanes_final.csv'
## 3000 mcq240q 1/0/T/F/TRUE/FALSE 56 './RInputFiles/nhanes_final.csv'
## 3892 mcq240s 1/0/T/F/TRUE/FALSE 25 './RInputFiles/nhanes_final.csv'
## .... ....... .................. ...... ................................
## See problems(...) for more details.
# merge the 3 datasets you just created to create nhanes_combined
# Join pairwise on the respondent sequence number "seqn"; this chain is the
# same left fold that Reduce() over the list of data frames would perform.
nhanes_combined <- nhanes_demo %>%
  inner_join(nhanes_medical, by = "seqn") %>%
  inner_join(nhanes_bodymeasures, by = "seqn")
# Mean weight (bmxwt) by mcq365d ("doctor advised to lose weight"); before
# cleaning, the grouping variable still carries codes 1, 2, 9 and NA.
nhanes_combined %>% group_by(mcq365d) %>% summarise(mean = mean(bmxwt, na.rm = TRUE))
## # A tibble: 4 x 2
## mcq365d mean
## <dbl> <dbl>
## 1 1 90.7
## 2 2 76.5
## 3 9 90.8
## 4 NA 33.5
# Boxplot of weight by mcq365d, restricted to respondents older than 16.
nhanes_combined %>% filter(ridageyr > 16) %>%
ggplot(aes(x=as.factor(mcq365d), y=bmxwt)) +
geom_boxplot()
## Warning: Removed 70 rows containing non-finite values (stat_boxplot).
# Keep only respondents older than 16.
# NOTE(review): the course comment said "less than 16", but ridageyr > 16
# also drops respondents who are exactly 16.
nhanes_filter <- nhanes_combined %>% filter(ridageyr > 16)
# Impute missing bmxwt with the median weight within gender (riagendr).
nhanes_final <- simputation::impute_median(nhanes_filter, bmxwt ~ riagendr)
# Recode mcq365d: fold code 9 into code 2 (presumably 9 = refused/don't
# know — confirm against the NHANES codebook), then check the counts.
nhanes_final$mcq365d <- ifelse(nhanes_final$mcq365d==9, 2, nhanes_final$mcq365d)
table(nhanes_final$mcq365d)
##
## 1 2
## 1802 4085
# Simple random sample of 2500 rows.
# NOTE(review): no set.seed() precedes sample(), so this draw is not reproducible.
nhanes_srs <- nhanes_final[sample(nrow(nhanes_final), 2500), ]
# Stratified sample: 2000 respondents from each gender stratum.
nhanes_stratified <- nhanes_final %>%
group_by(riagendr) %>%
sample_n(2000)
table(nhanes_stratified$riagendr)
##
## 1 2
## 2000 2000
# Cluster sample: draw 6 household-income (indhhin2) clusters, simple
# random sampling without replacement ("srswor").
nhanes_cluster <- sampling::cluster(nhanes_final, "indhhin2", 6, method = "srswor")
# Inspect design.rcbd's arguments before calling it.
str(agricolae::design.rcbd)
## function (trt, r, serie = 2, seed = 0, kinds = "Super-Duper", first = TRUE,
## continue = FALSE, randomization = TRUE)
# Five treatments A-E, replicated in 4 blocks.
# NOTE(review): `rep <- 4` shadows base::rep() for the rest of the session.
trt <- LETTERS[1:5]
rep <- 4
# Build the randomized complete block design (seed fixed for reproducibility)
# and view the sketch: one row per block, each containing every treatment once.
my_design_rcbd <- agricolae::design.rcbd(trt, r=rep, seed = 42, serie=0)
my_design_rcbd$sketch
## [,1] [,2] [,3] [,4] [,5]
## [1,] "D" "E" "A" "C" "B"
## [2,] "B" "C" "A" "E" "D"
## [3,] "C" "D" "A" "E" "B"
## [4,] "A" "C" "B" "D" "E"
# Convert gender to a factor so aov() treats it as a blocking factor.
nhanes_final$riagendr <- factor(nhanes_final$riagendr)
# RCBD analysis: treatment mcq365d, blocked by gender.
# NOTE(review): mcq365d is still numeric here (see Df = 1 in the summary);
# with only two levels that fits the same single-df contrast a factor would,
# but converting it to a factor as well would make the intent explicit.
nhanes_rcbd <- aov(bmxwt ~ mcq365d + riagendr, data=nhanes_final)
# Both the treatment and the block are highly significant.
summary(nhanes_rcbd)
## Df Sum Sq Mean Sq F value Pr(>F)
## mcq365d 1 229164 229164 571.2 <2e-16 ***
## riagendr 1 163069 163069 406.4 <2e-16 ***
## Residuals 5884 2360774 401
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Cell means: weight by advice group within each gender.
nhanes_final %>% group_by(mcq365d, riagendr) %>% summarise(mean_wt = mean(bmxwt))
## # A tibble: 4 x 3
## # Groups: mcq365d [2]
## mcq365d riagendr mean_wt
## <dbl> <fct> <dbl>
## 1 1 1 95.2
## 2 1 2 86.6
## 3 2 1 82.7
## 4 2 2 71.3
# Standard 2x2 grid of aov diagnostic plots, then restore the single panel.
par(mfrow=c(2, 2))
plot(nhanes_rcbd)
par(mfrow=c(1, 1))
# Interaction plot: advice group on the x-axis, one trace per gender.
with(nhanes_final, interaction.plot(mcq365d, riagendr, bmxwt))
# Same data with the roles swapped: gender on the x-axis.
with(nhanes_final, interaction.plot(riagendr, mcq365d, bmxwt))
# Attempted BIBD with k > t (block size 4 for only 3 treatments): infeasible.
# my_design_bibd_1 <- design.bib(LETTERS[1:3], k = 4, r = 16, serie = 0, seed = 42) # will throw an error
# Attempted BIBD whose lambda is not a whole number: design.bib warns.
# my_design_bibd_2 <- design.bib(letters[1:2], k = 3, r = 5, serie = 0, seed = 42) # will throw warning
# A feasible balanced incomplete block design: t = 4 treatments, block size
# k = 4, r = 6 replicates.
my_design_bibd_3 <- agricolae::design.bib(letters[1:4], k = 4, r = 6, serie = 0, seed = 42)
##
## Parameters BIB
## ==============
## Lambda : 6
## treatmeans : 4
## Block size : 4
## Blocks : 6
## Replication: 6
##
## Efficiency factor 1
##
## <<< Book >>>
# One row per block; every treatment appears in every block here (k = t).
my_design_bibd_3$sketch
## [,1] [,2] [,3] [,4]
## [1,] "d" "b" "a" "c"
## [2,] "d" "c" "b" "a"
## [3,] "c" "d" "b" "a"
## [4,] "a" "b" "d" "c"
## [5,] "b" "d" "a" "c"
## [6,] "a" "b" "d" "c"
# Compute the lambda parameter of a balanced incomplete block design:
# lambda = r * (k - 1) / (t - 1), where t is the number of treatments,
# k the block size, and r the number of replicates per treatment.
lambda <- function(t, k, r) {
  r * (k - 1) / (t - 1)
}
# lambda for t = 4, k = 3, r = 3: (3 * 2) / 3 = 2 (a whole number).
lambda(4, 3, 3)
## [1] 2
# Build the cat-experiment data frame: 12 creatinine readings with food
# (treatment) and coat color (block) assignments.
creatinine <- c(1.98, 1.97, 2.35, 2.09, 1.87, 1.95, 2.08, 2.01, 1.84, 2.06, 1.97, 2.22)
food <- as.factor(c("A", "C", "D", "A", "B", "C", "B", "C", "D", "A", "B", "D"))
color <- as.factor(rep(c("Black", "White", "Orange", "Spotted"), each = 3))
# NOTE(review): cbind() builds a numeric matrix, coercing the factors to
# their integer codes, so food and color enter the model below as numeric
# covariates (hence Df = 1 each in the ANOVA rather than 3). Using
# data.frame(creatinine, food, color) would keep them as factors.
cat_experiment <- as.data.frame(cbind(creatinine, food, color))
# Fit the (mis-specified, per the note above) additive model and inspect it.
cat_model <- aov(creatinine ~ food + color, data=cat_experiment)
summary(cat_model)
## Df Sum Sq Mean Sq F value Pr(>F)
## food 1 0.01204 0.012042 0.530 0.485
## color 1 0.00697 0.006971 0.307 0.593
## Residuals 9 0.20461 0.022735
# lambda for t = 3, k = 3, r = 2: (2 * 2) / 2 = 2.
lambda(3, 3, 2)
## [1] 2
# Would fit a BIBD model, but weightlift_treat does not exist in nhanes_final.
# weightlift_model <- aov(bmxarmc ~ weightlift_treat + ridreth1, data=nhanes_final)
# summary(weightlift_model)
Chapter 4 - Latin Squares, Graeco-Latin Squares, Factorial Experiments
Latin Squares have two blocking factors, assumed not to interact with each other or the treatment, and each with the same number of levels:
Graeco-Latin Squares build on Latin squares by adding an additional blocking factor:
Factorial Experiments - designs in which 2+ variables are crossed in an experiment, with each combination considered a factor:
Next steps:
Example code includes:
# Load the NYC schools SAT dataset; identifier/address fields parse as
# character, school start/end times as time-of-day, everything else as double.
nyc_scores <- readr::read_csv("./RInputFiles/nyc_scores.csv")
## Parsed with column specification:
## cols(
## .default = col_double(),
## School_ID = col_character(),
## School_Name = col_character(),
## Borough = col_character(),
## Building_Code = col_character(),
## Street_Address = col_character(),
## City = col_character(),
## State = col_character(),
## Phone_Number = col_character(),
## Start_Time = col_time(format = ""),
## End_Time = col_time(format = "")
## )
## See spec(...) for full column specifications.
# 435 schools, 22 columns.
glimpse(nyc_scores)
## Observations: 435
## Variables: 22
## $ School_ID <chr> "02M260", "06M211", "01M539", "02M294", "...
## $ School_Name <chr> "Clinton School Writers and Artists", "In...
## $ Borough <chr> "Manhattan", "Manhattan", "Manhattan", "M...
## $ Building_Code <chr> "M933", "M052", "M022", "M445", "M445", "...
## $ Street_Address <chr> "425 West 33rd Street", "650 Academy Stre...
## $ City <chr> "Manhattan", "Manhattan", "Manhattan", "M...
## $ State <chr> "NY", "NY", "NY", "NY", "NY", "NY", "NY",...
## $ Zip_Code <dbl> 10001, 10002, 10002, 10002, 10002, 10002,...
## $ Latitude <dbl> 40.75321, 40.86605, 40.71873, 40.71687, 4...
## $ Longitude <dbl> -73.99786, -73.92486, -73.97943, -73.9895...
## $ Phone_Number <chr> "212-695-9114", "718-935-3660", "212-677-...
## $ Start_Time <time> NA, 08:30:00, 08:15:00, 08:00:00, ...
## $ End_Time <time> NA, 15:00:00, 16:00:00, 14:45:00, ...
## $ Student_Enrollment <dbl> NA, 87, 1735, 358, 383, 416, 255, 545, 32...
## $ Percent_White <dbl> NA, 0.03, 0.29, 0.12, 0.03, 0.02, 0.04, 0...
## $ Percent_Black <dbl> NA, 0.22, 0.13, 0.39, 0.28, 0.03, 0.24, 0...
## $ Percent_Hispanic <dbl> NA, 0.68, 0.18, 0.41, 0.57, 0.06, 0.57, 0...
## $ Percent_Asian <dbl> NA, 0.05, 0.39, 0.06, 0.09, 0.89, 0.13, 0...
## $ Average_Score_SAT_Math <dbl> NA, NA, 657, 395, 418, 613, 410, 634, 389...
## $ Average_Score_SAT_Reading <dbl> NA, NA, 601, 411, 428, 453, 406, 641, 395...
## $ Average_Score_SAT_Writing <dbl> NA, NA, 601, 387, 415, 463, 381, 639, 381...
## $ Percent_Tested <dbl> NA, NA, 0.91, 0.79, 0.65, 0.96, 0.60, 0.7...
# Simulated teacher education level, one entry per school row (levels:
# 'BA', 'MA', 'PhD', 'Grad Student', 'College Student'). First part of two.
tEL <- c('PhD', 'BA', 'BA', 'MA', 'MA', 'PhD', 'MA', 'MA', 'BA', 'PhD', 'College Student', 'College Student', 'Grad Student', 'MA', 'MA', 'MA', 'BA', 'MA', 'BA', 'MA', 'College Student', 'PhD', 'MA', 'MA', 'BA', 'MA', 'College Student', 'BA', 'PhD', 'Grad Student', 'MA', 'Grad Student', 'MA', 'College Student', 'Grad Student', 'MA', 'Grad Student', 'BA', 'BA', 'College Student', 'Grad Student', 'College Student', 'BA', 'BA', 'PhD', 'BA', 'Grad Student', 'Grad Student', 'College Student', 'College Student', 'BA', 'PhD', 'College Student', 'PhD', 'PhD', 'PhD', 'College Student', 'Grad Student', 'MA', 'MA', 'BA', 'PhD', 'College Student', 'MA', 'MA', 'College Student', 'Grad Student', 'MA', 'PhD', 'MA', 'College Student', 'MA', 'PhD', 'MA', 'College Student', 'College Student', 'Grad Student', 'PhD', 'MA', 'MA', 'Grad Student', 'MA', 'MA', 'Grad Student', 'PhD', 'Grad Student', 'Grad Student', 'Grad Student', 'MA', 'PhD', 'BA', 'MA', 'Grad Student', 'BA', 'College Student', 'MA', 'College Student', 'Grad Student', 'Grad Student', 'College Student', 'MA', 'BA', 'BA', 'MA', 'MA', 'Grad Student', 'MA', 'Grad Student', 'MA', 'Grad Student', 'College Student', 'College Student', 'College Student', 'MA', 'BA', 'Grad Student', 'Grad Student', 'MA', 'College Student', 'BA', 'Grad Student', 'MA', 'Grad Student', 'PhD', 'MA', 'MA', 'College Student', 'MA', 'College Student', 'PhD', 'College Student', 'MA', 'MA', 'MA', 'MA', 'College Student', 'MA', 'BA', 'MA', 'Grad Student', 'BA', 'MA', 'MA', 'Grad Student', 'MA', 'MA', 'College Student', 'MA', 'MA', 'BA', 'MA', 'College Student', 'Grad Student', 'College Student', 'MA', 'BA', 'MA', 'BA', 'College Student', 'Grad Student', 'Grad Student', 'Grad Student', 'Grad Student', 'Grad Student', 'MA', 'BA', 'MA', 'BA', 'College Student', 'MA', 'BA', 'MA', 'Grad Student', 'MA', 'PhD', 'MA', 'BA', 'Grad Student', 'MA', 'BA', 'BA', 'MA', 'BA', 'College Student', 'BA', 'MA', 'MA', 'BA', 'MA', 'College Student', 'BA', 'Grad Student', 'MA', 
'BA', 'MA', 'MA', 'MA', 'BA', 'College Student', 'College Student')
# Append the remaining Teacher_Education_Level values (second of two parts).
# FIX: in the original, the literal 'Grad Student' near the end of the first
# physical line was broken across a line break. R allows newlines inside a
# quoted string, so that entry would have contained an embedded newline
# ('Grad \nStudent') and produced a spurious sixth level — the grouped
# summaries shown later have exactly five levels. The literal is rejoined here.
tEL <- c(tEL, 'BA', 'Grad Student', 'BA', 'BA', 'MA', 'Grad Student', 'BA', 'MA', 'BA', 'PhD', 'MA', 'MA', 'MA', 'BA', 'College Student', 'PhD', 'BA', 'Grad Student', 'BA', 'College Student', 'BA', 'MA', 'College Student', 'MA', 'College Student', 'Grad Student', 'College Student', 'MA', 'PhD', 'BA', 'PhD', 'Grad Student', 'BA', 'BA', 'MA', 'MA', 'BA', 'PhD', 'College Student', 'MA', 'BA', 'College Student', 'BA', 'MA', 'College Student', 'MA', 'College Student', 'BA', 'MA', 'BA', 'BA', 'MA', 'PhD', 'BA', 'MA', 'Grad Student', 'College Student', 'MA', 'College Student', 'MA', 'MA', 'PhD', 'College Student', 'College Student', 'Grad Student', 'Grad Student', 'MA', 'College Student', 'Grad Student', 'Grad Student', 'Grad Student', 'MA', 'Grad Student', 'MA', 'BA', 'College Student', 'MA', 'Grad Student', 'College Student', 'MA', 'BA', 'BA', 'College Student', 'College Student', 'College Student', 'College Student', 'College Student', 'PhD', 'MA', 'College Student', 'MA', 'MA', 'MA', 'PhD', 'College Student', 'College Student', 'MA', 'MA', 'MA', 'PhD', 'MA', 'MA', 'PhD', 'MA', 'Grad Student', 'MA', 'Grad Student', 'MA', 'Grad Student', 'MA', 'MA', 'PhD', 'BA', 'BA', 'Grad Student', 'Grad Student', 'PhD', 'BA', 'BA', 'Grad Student', 'College Student', 'BA', 'College Student', 'MA', 'MA', 'MA', 'Grad Student', 'BA', 'BA', 'MA', 'Grad Student', 'PhD', 'BA', 'Grad Student', 'Grad Student', 'Grad Student', 'BA', 'MA', 'BA', 'College Student', 'College Student', 'Grad Student', 'MA', 'Grad Student', 'Grad Student', 'BA', 'BA', 'MA', 'College Student', 'BA', 'Grad Student', 'Grad Student', 'College Student', 'Grad Student', 'College Student', 'PhD', 'BA', 'MA', 'MA', 'BA', 'College Student', 'College Student', 'PhD', 'MA', 'BA', 'MA', 'MA', 'Grad Student', 'MA', 'PhD', 'MA', 'MA', 'Grad Student', 'College Student', 'MA', 'BA', 'BA', 'College Student', 'Grad Student', 'BA', 'MA', 'MA', 'Grad Student', 'BA', 'Grad Student', 'Grad Student', 'MA', 'PhD', 'Grad Student', 'Grad Student',
'MA', 'MA', 'PhD', 'College Student', 'College Student', 'MA', 'BA', 'MA', 'College Student', 'MA', 'PhD', 'BA', 'MA', 'College Student', 'PhD', 'PhD', 'College Student', 'MA', 'MA', 'MA', 'PhD', 'MA', 'BA', 'College Student', 'BA', 'BA', 'MA', 'MA', 'College Student', 'College Student', 'Grad Student', 'College Student', 'MA', 'MA', 'MA', 'Grad Student', 'MA', 'College Student', 'Grad Student', 'BA', 'Grad Student', 'BA', 'MA', 'College Student', 'MA')
# Attach the simulated teacher education level as a 23rd column.
nyc_scores <- nyc_scores %>%
mutate(Teacher_Education_Level=tEL)
glimpse(nyc_scores)
## Observations: 435
## Variables: 23
## $ School_ID <chr> "02M260", "06M211", "01M539", "02M294", "...
## $ School_Name <chr> "Clinton School Writers and Artists", "In...
## $ Borough <chr> "Manhattan", "Manhattan", "Manhattan", "M...
## $ Building_Code <chr> "M933", "M052", "M022", "M445", "M445", "...
## $ Street_Address <chr> "425 West 33rd Street", "650 Academy Stre...
## $ City <chr> "Manhattan", "Manhattan", "Manhattan", "M...
## $ State <chr> "NY", "NY", "NY", "NY", "NY", "NY", "NY",...
## $ Zip_Code <dbl> 10001, 10002, 10002, 10002, 10002, 10002,...
## $ Latitude <dbl> 40.75321, 40.86605, 40.71873, 40.71687, 4...
## $ Longitude <dbl> -73.99786, -73.92486, -73.97943, -73.9895...
## $ Phone_Number <chr> "212-695-9114", "718-935-3660", "212-677-...
## $ Start_Time <time> NA, 08:30:00, 08:15:00, 08:00:00, ...
## $ End_Time <time> NA, 15:00:00, 16:00:00, 14:45:00, ...
## $ Student_Enrollment <dbl> NA, 87, 1735, 358, 383, 416, 255, 545, 32...
## $ Percent_White <dbl> NA, 0.03, 0.29, 0.12, 0.03, 0.02, 0.04, 0...
## $ Percent_Black <dbl> NA, 0.22, 0.13, 0.39, 0.28, 0.03, 0.24, 0...
## $ Percent_Hispanic <dbl> NA, 0.68, 0.18, 0.41, 0.57, 0.06, 0.57, 0...
## $ Percent_Asian <dbl> NA, 0.05, 0.39, 0.06, 0.09, 0.89, 0.13, 0...
## $ Average_Score_SAT_Math <dbl> NA, NA, 657, 395, 418, 613, 410, 634, 389...
## $ Average_Score_SAT_Reading <dbl> NA, NA, 601, 411, 428, 453, 406, 641, 395...
## $ Average_Score_SAT_Writing <dbl> NA, NA, 601, 387, 415, 463, 381, 639, 381...
## $ Percent_Tested <dbl> NA, NA, 0.91, 0.79, 0.65, 0.96, 0.60, 0.7...
## $ Teacher_Education_Level <chr> "PhD", "BA", "BA", "MA", "PhD", "MA...
# Mean, variance, and median of the Math score by Borough.
nyc_scores %>%
group_by(Borough) %>%
summarise(mean = mean(Average_Score_SAT_Math, na.rm=TRUE),
var = var(Average_Score_SAT_Math, na.rm=TRUE),
median = median(Average_Score_SAT_Math, na.rm=TRUE))
## # A tibble: 5 x 4
## Borough mean var median
## <chr> <dbl> <dbl> <dbl>
## 1 Bronx 404. 2727. 396.
## 2 Brooklyn 416. 3658. 395
## 3 Manhattan 456. 7026. 433
## 4 Queens 462. 5168. 448
## 5 Staten Island 486. 6911. 466.
# Same summary by Teacher_Education_Level.
nyc_scores %>%
group_by(Teacher_Education_Level) %>%
summarise(mean = mean(Average_Score_SAT_Math, na.rm=TRUE),
var = var(Average_Score_SAT_Math, na.rm=TRUE),
median = median(Average_Score_SAT_Math, na.rm=TRUE))
## # A tibble: 5 x 4
## Teacher_Education_Level mean var median
## <chr> <dbl> <dbl> <dbl>
## 1 BA 438. 5536. 418
## 2 College Student 424. 4807. 400.
## 3 Grad Student 437. 7071. 410
## 4 MA 432. 4304. 415
## 5 PhD 435. 4869. 420.
# Same summary crossed by Borough and Teacher_Education_Level.
nyc_scores %>%
group_by(Borough, Teacher_Education_Level) %>%
summarise(mean = mean(Average_Score_SAT_Math, na.rm=TRUE),
var = var(Average_Score_SAT_Math, na.rm=TRUE),
median = median(Average_Score_SAT_Math, na.rm=TRUE))
## # A tibble: 24 x 5
## # Groups: Borough [5]
## Borough Teacher_Education_Level mean var median
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 Bronx BA 428. 7446. 407
## 2 Bronx College Student 384. 493. 382
## 3 Bronx Grad Student 400. 1776. 397
## 4 Bronx MA 401. 901. 395
## 5 Bronx PhD 400. 2468. 386
## 6 Brooklyn BA 425. 4005. 405
## 7 Brooklyn College Student 396. 3035. 382
## 8 Brooklyn Grad Student 436. 5756. 408
## 9 Brooklyn MA 414. 2401. 395
## 10 Brooklyn PhD 402 1868. 394.
## # ... with 14 more rows
# If we want to use SAT scores as our outcome, we need to examine their missingness
# First, look at the pattern of missingness using md.pattern() from the mice package
# There are 60 values missing in each of the three SAT score columns
# There are many R packages which help with more advanced forms of imputation, such as MICE, Amelia, mi, and more
# We will use simputation and impute_median() as we did previously
# Rows of the table are missingness patterns (left counts = number of rows
# with that pattern); the bottom row counts missing values per column.
mice::md.pattern(nyc_scores)
## School_ID School_Name Borough Building_Code Street_Address City State
## 374 1 1 1 1 1 1 1
## 11 1 1 1 1 1 1 1
## 42 1 1 1 1 1 1 1
## 4 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1
## 3 1 1 1 1 1 1 1
## 0 0 0 0 0 0 0
## Zip_Code Latitude Longitude Phone_Number Teacher_Education_Level Start_Time
## 374 1 1 1 1 1 1
## 11 1 1 1 1 1 1
## 42 1 1 1 1 1 1
## 4 1 1 1 1 1 1
## 1 1 1 1 1 1 0
## 3 1 1 1 1 1 0
## 0 0 0 0 0 4
## End_Time Student_Enrollment Percent_White Percent_Black Percent_Hispanic
## 374 1 1 1 1 1
## 11 1 1 1 1 1
## 42 1 1 1 1 1
## 4 1 0 0 0 0
## 1 0 1 1 1 1
## 3 0 0 0 0 0
## 4 7 7 7 7
## Percent_Asian Percent_Tested Average_Score_SAT_Math
## 374 1 1 1
## 11 1 1 0
## 42 1 0 0
## 4 0 0 0
## 1 1 1 1
## 3 0 0 0
## 7 49 60
## Average_Score_SAT_Reading Average_Score_SAT_Writing
## 374 1 1 0
## 11 0 0 3
## 42 0 0 4
## 4 0 0 9
## 1 1 1 2
## 3 0 0 11
## 60 60 272
# Impute the missing Math scores with the median score within each Borough.
# NOTE(review): the original comment said "Math, Writing, and Reading", but
# only Average_Score_SAT_Math is imputed here; Reading and Writing keep NAs.
nyc_scores_2 <- simputation::impute_median(nyc_scores, Average_Score_SAT_Math ~ Borough)
# Convert the imputed score back to plain numeric.
nyc_scores_2$Average_Score_SAT_Math <- as.numeric(nyc_scores_2$Average_Score_SAT_Math)
# Compare scores by Borough before and after imputation: medians are
# unchanged by construction, means shift slightly toward the medians.
nyc_scores %>%
group_by(Borough) %>%
summarise(median = median(Average_Score_SAT_Math, na.rm = TRUE), mean = mean(Average_Score_SAT_Math, na.rm = TRUE))
## # A tibble: 5 x 3
## Borough median mean
## <chr> <dbl> <dbl>
## 1 Bronx 396. 404.
## 2 Brooklyn 395 416.
## 3 Manhattan 433 456.
## 4 Queens 448 462.
## 5 Staten Island 466. 486.
nyc_scores_2 %>%
group_by(Borough) %>%
summarise(median = median(Average_Score_SAT_Math, na.rm = TRUE), mean = mean(Average_Score_SAT_Math, na.rm = TRUE))
## # A tibble: 5 x 3
## Borough median mean
## <chr> <dbl> <dbl>
## 1 Bronx 396. 403.
## 2 Brooklyn 395 414.
## 3 Manhattan 433 452.
## 4 Queens 448 460.
## 5 Staten Island 466. 486.
# Design a 5x5 Latin square with treatments A-E: each treatment appears
# exactly once in every row and every column of the sketch.
my_design_lsd <- agricolae::design.lsd(LETTERS[1:5], serie=0, seed=42)
my_design_lsd$sketch
## [,1] [,2] [,3] [,4] [,5]
## [1,] "B" "E" "D" "A" "C"
## [2,] "A" "D" "C" "E" "B"
## [3,] "E" "C" "B" "D" "A"
## [4,] "C" "A" "E" "B" "D"
## [5,] "D" "B" "A" "C" "E"
# To execute a Latin Square design on this data, suppose we want to know the effect of our tutoring program, which includes one-on-one tutoring, two small groups, and an in and after school SAT prep class
# A new dataset nyc_scores_ls is available that represents this experiment. Feel free to explore the dataset in the console.
# We'll block by Borough and Teacher_Education_Level to reduce their known variance on the score outcome
# Borough is a good blocking factor because schools in America are funded partly based on taxes paid in each city, so it will likely make a difference on quality of education
# The 25 schools selected for the Latin square (5 boroughs x 5 teacher
# education levels) and the tutoring program assigned to each.
lsID <- c('11X290', '10X342', '09X260', '09X412', '12X479', '14K478', '32K554', '14K685', '22K405', '17K382', '05M692', '02M427', '02M308', '03M402', '02M282', '30Q501', '26Q495', '24Q455', '29Q326', '25Q670', '31R450', '31R445', '31R080', '31R460', '31R455')
lsTP <- c('One-on-One', 'Small Groups (2-3)', 'Small Groups (4-6)', 'SAT Prep Class (school hours)', 'SAT Prep Class (after school)', 'Small Groups (2-3)', 'Small Groups (4-6)', 'SAT Prep Class (school hours)', 'SAT Prep Class (after school)', 'One-on-One', 'Small Groups (4-6)', 'SAT Prep Class (school hours)', 'SAT Prep Class (after school)', 'One-on-One', 'Small Groups (2-3)', 'SAT Prep Class (school hours)', 'SAT Prep Class (after school)', 'One-on-One', 'Small Groups (2-3)', 'Small Groups (4-6)', 'SAT Prep Class (after school)', 'One-on-One', 'Small Groups (2-3)', 'Small Groups (4-6)', 'SAT Prep Class (school hours)')
# Subset the imputed data to those schools and attach the treatment column.
nyc_scores_ls <- nyc_scores_2 %>%
filter(School_ID %in% lsID) %>%
mutate(Tutoring_Program=lsTP)
# Fit the Latin-square model: treatment plus the two blocking factors.
nyc_scores_ls_lm <- lm(Average_Score_SAT_Math ~ Tutoring_Program + Borough + Teacher_Education_Level,
data=nyc_scores_ls
)
# Coefficient table.
nyc_scores_ls_lm %>% broom::tidy()
## # A tibble: 13 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 398. 70.6 5.64 1.09e-4
## 2 Tutoring_ProgramSAT Prep Class (after s~ -3.23 49.7 -0.0650 9.49e-1
## 3 Tutoring_ProgramSAT Prep Class (school ~ 17.7 63.3 0.279 7.85e-1
## 4 Tutoring_ProgramSmall Groups (2-3) -46.0 58.5 -0.787 4.47e-1
## 5 Tutoring_ProgramSmall Groups (4-6) -48.4 49.7 -0.973 3.50e-1
## 6 BoroughBrooklyn 59.9 59.5 1.01 3.34e-1
## 7 BoroughManhattan 55.5 53.0 1.05 3.16e-1
## 8 BoroughQueens 84.5 59.1 1.43 1.78e-1
## 9 BoroughStaten Island 54.0 49.2 1.10 2.93e-1
## 10 Teacher_Education_LevelCollege Student 16.3 81.0 0.201 8.44e-1
## 11 Teacher_Education_LevelGrad Student 71.9 64.3 1.12 2.86e-1
## 12 Teacher_Education_LevelMA 8.12 46.5 0.175 8.64e-1
## 13 Teacher_Education_LevelPhD -68.1 102. -0.671 5.15e-1
# ANOVA table: neither the treatment nor the blocks reach significance here.
nyc_scores_ls_lm %>% anova()
## Analysis of Variance Table
##
## Response: Average_Score_SAT_Math
## Df Sum Sq Mean Sq F value Pr(>F)
## Tutoring_Program 4 42101 10525.3 1.8387 0.1863
## Borough 4 13665 3416.3 0.5968 0.6719
## Teacher_Education_Level 4 15046 3761.6 0.6571 0.6332
## Residuals 12 68693 5724.4
# Boxplot of Math scores by Borough. labs() sets the same title and x/y
# axis labels that the separate ggtitle()/xlab()/ylab() calls would.
ggplot(nyc_scores, aes(x = Borough, y = Average_Score_SAT_Math)) +
  geom_boxplot() +
  labs(
    title = "Average SAT Math Scores by Borough, NYC",
    x = "Borough (NYC)",
    y = "Average SAT Math Scores (2014-15)"
  )
## Warning: Removed 60 rows containing non-finite values (stat_boxplot).
# Two treatment sets for the Graeco-Latin square: letters A-E and digits 1-5.
trt1 <- LETTERS[1:5]
trt2 <- 1:5
# Build the 5x5 Graeco-Latin square design.
my_graeco_design <- agricolae::design.graeco(trt1, trt2, serie=0, seed=42)
# Inspect the parameters used...
my_graeco_design$parameters
## $design
## [1] "graeco"
##
## $trt1
## [1] "A" "B" "C" "D" "E"
##
## $trt2
## [1] 1 2 3 4 5
##
## $r
## [1] 5
##
## $serie
## [1] 0
##
## $seed
## [1] 42
##
## $kinds
## [1] "Super-Duper"
##
## [[8]]
## [1] TRUE
# ...and the sketch: each cell pairs a letter with a digit, and every
# letter and every digit appears once per row and once per column.
my_graeco_design$sketch
## [,1] [,2] [,3] [,4] [,5]
## [1,] "D 2" "E 3" "A 1" "C 5" "B 4"
## [2,] "E 1" "A 5" "C 4" "B 2" "D 3"
## [3,] "A 4" "C 2" "B 3" "D 1" "E 5"
## [4,] "C 3" "B 1" "D 5" "E 4" "A 2"
## [5,] "B 5" "D 4" "E 2" "A 3" "C 1"
# The 25 schools selected for the Graeco-Latin square, their assigned
# tutoring program, and the additional blocking factor (homework type).
glsID <- c('09X241', '10X565', '09X260', '07X259', '11X455', '18K563', '23K697', '32K403', '22K425', '16K688', '02M135', '06M348', '02M419', '02M489', '04M495', '30Q502', '24Q530', '30Q555', '24Q560', '27Q650', '31R440', '31R064', '31R450', '31R445', '31R460')
glsTP <- c('SAT Prep Class (school hours)', 'SAT Prep Class (after school)', 'One-on-One', 'Small Groups (4-6)', 'Small Groups (2-3)', 'SAT Prep Class (after school)', 'One-on-One', 'Small Groups (4-6)', 'Small Groups (2-3)', 'SAT Prep Class (school hours)', 'One-on-One', 'Small Groups (4-6)', 'Small Groups (2-3)', 'SAT Prep Class (school hours)', 'SAT Prep Class (after school)', 'Small Groups (4-6)', 'Small Groups (2-3)', 'SAT Prep Class (school hours)', 'SAT Prep Class (after school)', 'One-on-One', 'Small Groups (2-3)', 'SAT Prep Class (school hours)', 'SAT Prep Class (after school)', 'One-on-One', 'Small Groups (4-6)')
glsHT <- c('Small Group', 'Large Group', 'Individual', 'Mix of Large Group/Individual', 'Mix of Small Group/Individual', 'Individual', 'Mix of Large Group/Individual', 'Mix of Small Group/Individual', 'Small Group', 'Large Group', 'Mix of Small Group/Individual', 'Small Group', 'Large Group', 'Individual', 'Mix of Large Group/Individual', 'Large Group', 'Individual', 'Mix of Large Group/Individual', 'Mix of Small Group/Individual', 'Small Group', 'Mix of Large Group/Individual', 'Mix of Small Group/Individual', 'Small Group', 'Large Group', 'Individual')
# Subset the imputed data and attach both experimental columns.
nyc_scores_gls <- nyc_scores_2 %>%
filter(School_ID %in% glsID) %>%
mutate(Tutoring_Program=glsTP, Homework_Type=glsHT)
# Graeco-Latin square model: treatment plus three blocking factors.
nyc_scores_gls_lm <- lm(Average_Score_SAT_Math ~ Tutoring_Program + Borough + Teacher_Education_Level + Homework_Type, data=nyc_scores_gls)
# Coefficient table.
nyc_scores_gls_lm %>% broom::tidy()
## # A tibble: 17 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 376. 32.1 11.7 2.61e-6
## 2 Tutoring_ProgramSAT Prep Class (after~ 43.4 27.2 1.60 1.49e-1
## 3 Tutoring_ProgramSAT Prep Class (schoo~ 28.2 29.3 0.960 3.65e-1
## 4 Tutoring_ProgramSmall Groups (2-3) 32.7 27.3 1.20 2.66e-1
## 5 Tutoring_ProgramSmall Groups (4-6) 56.1 27.5 2.04 7.51e-2
## 6 BoroughBrooklyn -21.5 28.9 -0.744 4.78e-1
## 7 BoroughManhattan 13.5 28.0 0.484 6.42e-1
## 8 BoroughQueens 41.8 24.8 1.68 1.31e-1
## 9 BoroughStaten Island 25.1 22.9 1.10 3.05e-1
## 10 Teacher_Education_LevelCollege Student 2.61 30.6 0.0853 9.34e-1
## 11 Teacher_Education_LevelGrad Student 40.2 40.1 1.00 3.46e-1
## 12 Teacher_Education_LevelMA 18.0 22.9 0.786 4.55e-1
## 13 Teacher_Education_LevelPhD 14.2 36.3 0.392 7.05e-1
## 14 Homework_TypeLarge Group -1.07 25.8 -0.0415 9.68e-1
## 15 Homework_TypeMix of Large Group/Indiv~ -15.6 26.4 -0.592 5.70e-1
## 16 Homework_TypeMix of Small Group/Indiv~ 3.28 25.0 0.131 8.99e-1
## 17 Homework_TypeSmall Group 47.8 28.3 1.69 1.29e-1
# ANOVA table: the tutoring program is the closest to significance (p ~ 0.078).
nyc_scores_gls_lm %>% anova()
## Analysis of Variance Table
##
## Response: Average_Score_SAT_Math
## Df Sum Sq Mean Sq F value Pr(>F)
## Tutoring_Program 4 15371.5 3842.9 3.1570 0.07801 .
## Borough 4 5277.0 1319.3 1.0838 0.42563
## Teacher_Education_Level 4 2869.7 717.4 0.5894 0.67993
## Homework_Type 4 9738.7 2434.7 2.0002 0.18747
## Residuals 8 9737.9 1217.2
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
pctTHL <- c(1, 1, 2, 2, 2, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 1, 2, 2, 1, 2, 1, 2, 2, 1, 1, 2, 2, 2, 2, 1, 1, 2, 2, 1, 1, 2, 1, 2, 2, 2, 2, 1, 2, 1, 1, 2, 2, 1, 2, 1, 2, 1, 2, 1, 1, 2, 2, 1, 2, 1, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 1, 2, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 2, 2, 2, 2, 1, 2, 2, 1, 2, 2, 2, 2, 1, 2, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 2, 1, 2, 1, 1, 2, 1, 1, 2, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 2, 2, 1, 2, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, 1, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 2, 2, 2, 1, 2, 2, 1, 2, 2, 2, 1, 1, 2, 1, 2, 2, 2, 1, 2, 2, 1, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 2, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 2, 1, 1, 2, 2, 1, 2, 2, 2, 1, 2, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 2, 1, 2, 2, 2, 1, 2, 1, 2, 2, 2, 2, 2, 2, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 2, 1, 2, 1, 1, 2, 1, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 2, 1, 2, 1, 1, 2, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 1, 1, 1, 1, 2, 1, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 1, 1, 1, 2, 1, 1, 2, 1, 2)
pctBHL <- c(2, 1, 1, 2, 1, 1, 1, 1, 2, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 2, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 2, 2, 1, 2, 2, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2, 2, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 1, 2, 1, 1, 1, 2, 2, 2, 2, 1, 1, 2, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 2, 1, 2, 2, 1, 1, 2, 1, 1, 1, 2, 1, 2, 1, 1, 2, 2, 2, 1, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 1, 2, 1, 1, 2, 2, 1, 2, 1, 2, 2, 1, 1, 2, 1, 1, 1, 2, 1, 2, 1, 1, 2, 2, 2, 1, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 2, 2, 1, 2, 2, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 1, 2, 2, 2, 2, 1, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 2, 1, 2, 1, 2, 2, 2, 2, 2, 1, 2, 1, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1)
tP <- c('Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 
'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'No')
# Drop the unused teacher-education column and convert the three design
# variables (created above as plain vectors) into factors.
nyc_scores <- nyc_scores %>%
select(-Teacher_Education_Level) %>%
mutate(Percent_Tested_HL=factor(pctTHL), Percent_Black_HL=factor(pctBHL), Tutoring_Program=factor(tP))
# Build the boxplots for all 3 factor variables: tutoring program, pct black,
# pct tested. The "Removed 60 rows" warnings come from schools with missing
# math scores.
ggplot(nyc_scores, aes(x=Tutoring_Program, y=Average_Score_SAT_Math)) +
geom_boxplot()
## Warning: Removed 60 rows containing non-finite values (stat_boxplot).
ggplot(nyc_scores, aes(x=Percent_Black_HL, y=Average_Score_SAT_Math)) +
geom_boxplot()
## Warning: Removed 60 rows containing non-finite values (stat_boxplot).
ggplot(nyc_scores, aes(x=Percent_Tested_HL, y=Average_Score_SAT_Math)) +
geom_boxplot()
## Warning: Removed 60 rows containing non-finite values (stat_boxplot).
# Create nyc_scores_factorial: 2x2x2 factorial ANOVA; the * operator expands
# to all main effects plus every two- and three-way interaction.
nyc_scores_factorial <- aov(Average_Score_SAT_Math ~ Percent_Tested_HL * Percent_Black_HL * Tutoring_Program, data=nyc_scores)
# Examine the factorial ANOVA results as a tidy tibble
broom::tidy(nyc_scores_factorial)
## # A tibble: 8 x 6
## term df sumsq meansq statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Percent_Tested_HL 1 392193. 3.92e5 116. 1.21e-23
## 2 Percent_Black_HL 1 187252. 1.87e5 55.3 7.52e-13
## 3 Tutoring_Program 1 9419. 9.42e3 2.78 9.63e- 2
## 4 Percent_Tested_HL:Percent_Black_HL 1 88839. 8.88e4 26.2 4.94e- 7
## 5 Percent_Tested_HL:Tutoring_Program 1 1978. 1.98e3 0.584 4.45e- 1
## 6 Percent_Black_HL:Tutoring_Program 1 5157. 5.16e3 1.52 2.18e- 1
## 7 Percent_Tested_HL:Percent_Black_HL:~ 1 7773. 7.77e3 2.29 1.31e- 1
## 8 Residuals 367 1243641. 3.39e3 NA NA
# Use shapiro.test() to test the outcome for normality; the tiny p-value
# below indicates the raw scores depart from normality (an assumption check
# for the ANOVA).
shapiro.test(nyc_scores$Average_Score_SAT_Math)
##
## Shapiro-Wilk normality test
##
## data: nyc_scores$Average_Score_SAT_Math
## W = 0.84672, p-value < 2.2e-16
# Plot nyc_scores_factorial to examine residuals: show the four diagnostic
# plots in a 2x2 grid, then restore the single-panel layout.
par(mfrow = c(2, 2))
plot(nyc_scores_factorial)
par(mfrow = c(1, 1))
Chapter 1 - One-Factor Models
Model Specification - Structural Equation Models (SEM) explore the relationships between observed (manifest) variables and latent variables:
Model Analysis:
Model Assessment:
Example code includes:
# Load the lavaan library for confirmatory factor analysis (CFA)
library(lavaan)
## This is lavaan 0.6-5
## lavaan is BETA software! Please report any bugs.
# Look at the dataset: Holzinger & Swineford (1939) test scores; columns
# x1-x9 (positions 7-15) are the nine manifest test variables.
data(HolzingerSwineford1939, package="lavaan")
head(HolzingerSwineford1939[ , 7:15])
## x1 x2 x3 x4 x5 x6 x7 x8 x9
## 1 3.333333 7.75 0.375 2.333333 5.75 1.2857143 3.391304 5.75 6.361111
## 2 5.333333 5.25 2.125 1.666667 3.00 1.2857143 3.782609 6.25 7.916667
## 3 4.500000 5.25 1.875 1.000000 1.75 0.4285714 3.260870 3.90 4.416667
## 4 5.333333 7.75 3.000 2.666667 4.50 2.4285714 3.000000 5.30 4.861111
## 5 4.833333 4.75 0.875 2.666667 4.00 2.5714286 3.695652 6.30 5.916667
## 6 5.333333 5.00 2.250 1.000000 3.00 0.8571429 4.347826 6.65 7.500000
# Define your model specification: one latent factor ("textspeed") measured
# by the six manifest variables x4-x9 (=~ reads "is measured by").
text.model <- "textspeed =~ x4 + x5 + x6 + x7 + x8 + x9"
# Analyze the model with cfa(); the first loading (x4) is fixed to 1 by
# default to give the latent variable a scale (see the 1.000 in the output).
text.fit <- lavaan::cfa(model=text.model, data=HolzingerSwineford1939)
# Summarize the model (unstandardized estimates only)
summary(text.fit)
## lavaan 0.6-5 ended normally after 20 iterations
##
## Estimator ML
## Optimization method NLMINB
## Number of free parameters 12
##
## Number of observations 301
##
## Model Test User Model:
##
## Test statistic 149.786
## Degrees of freedom 9
## P-value (Chi-square) 0.000
##
## Parameter Estimates:
##
## Information Expected
## Information saturated (h1) model Structured
## Standard errors Standard
##
## Latent Variables:
## Estimate Std.Err z-value P(>|z|)
## textspeed =~
## x4 1.000
## x5 1.130 0.067 16.946 0.000
## x6 0.925 0.056 16.424 0.000
## x7 0.196 0.067 2.918 0.004
## x8 0.186 0.062 2.984 0.003
## x9 0.279 0.062 4.539 0.000
##
## Variances:
## Estimate Std.Err z-value P(>|z|)
## .x4 0.383 0.048 7.903 0.000
## .x5 0.424 0.059 7.251 0.000
## .x6 0.368 0.044 8.419 0.000
## .x7 1.146 0.094 12.217 0.000
## .x8 0.988 0.081 12.215 0.000
## .x9 0.940 0.077 12.142 0.000
## textspeed 0.968 0.112 8.647 0.000
# Re-summarize with standardized estimates (adds the Std.lv / Std.all columns)
summary(text.fit, standardized=TRUE)
## lavaan 0.6-5 ended normally after 20 iterations
##
## Estimator ML
## Optimization method NLMINB
## Number of free parameters 12
##
## Number of observations 301
##
## Model Test User Model:
##
## Test statistic 149.786
## Degrees of freedom 9
## P-value (Chi-square) 0.000
##
## Parameter Estimates:
##
## Information Expected
## Information saturated (h1) model Structured
## Standard errors Standard
##
## Latent Variables:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## textspeed =~
## x4 1.000 0.984 0.846
## x5 1.130 0.067 16.946 0.000 1.112 0.863
## x6 0.925 0.056 16.424 0.000 0.910 0.832
## x7 0.196 0.067 2.918 0.004 0.193 0.177
## x8 0.186 0.062 2.984 0.003 0.183 0.181
## x9 0.279 0.062 4.539 0.000 0.275 0.273
##
## Variances:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## .x4 0.383 0.048 7.903 0.000 0.383 0.284
## .x5 0.424 0.059 7.251 0.000 0.424 0.256
## .x6 0.368 0.044 8.419 0.000 0.368 0.308
## .x7 1.146 0.094 12.217 0.000 1.146 0.969
## .x8 0.988 0.081 12.215 0.000 0.988 0.967
## .x9 0.940 0.077 12.142 0.000 0.940 0.926
## textspeed 0.968 0.112 8.647 0.000 1.000 1.000
# Re-summarize with fit indices (CFI/TLI, RMSEA, SRMR, AIC/BIC sections)
summary(text.fit, fit.measures=TRUE)
## lavaan 0.6-5 ended normally after 20 iterations
##
## Estimator ML
## Optimization method NLMINB
## Number of free parameters 12
##
## Number of observations 301
##
## Model Test User Model:
##
## Test statistic 149.786
## Degrees of freedom 9
## P-value (Chi-square) 0.000
##
## Model Test Baseline Model:
##
## Test statistic 681.336
## Degrees of freedom 15
## P-value 0.000
##
## User Model versus Baseline Model:
##
## Comparative Fit Index (CFI) 0.789
## Tucker-Lewis Index (TLI) 0.648
##
## Loglikelihood and Information Criteria:
##
## Loglikelihood user model (H0) -2476.130
## Loglikelihood unrestricted model (H1) -2401.237
##
## Akaike (AIC) 4976.261
## Bayesian (BIC) 5020.746
## Sample-size adjusted Bayesian (BIC) 4982.689
##
## Root Mean Square Error of Approximation:
##
## RMSEA 0.228
## 90 Percent confidence interval - lower 0.197
## 90 Percent confidence interval - upper 0.261
## P-value RMSEA <= 0.05 0.000
##
## Standardized Root Mean Square Residual:
##
## SRMR 0.148
##
## Parameter Estimates:
##
## Information Expected
## Information saturated (h1) model Structured
## Standard errors Standard
##
## Latent Variables:
## Estimate Std.Err z-value P(>|z|)
## textspeed =~
## x4 1.000
## x5 1.130 0.067 16.946 0.000
## x6 0.925 0.056 16.424 0.000
## x7 0.196 0.067 2.918 0.004
## x8 0.186 0.062 2.984 0.003
## x9 0.279 0.062 4.539 0.000
##
## Variances:
## Estimate Std.Err z-value P(>|z|)
## .x4 0.383 0.048 7.903 0.000
## .x5 0.424 0.059 7.251 0.000
## .x6 0.368 0.044 8.419 0.000
## .x7 1.146 0.094 12.217 0.000
## .x8 0.988 0.081 12.215 0.000
## .x9 0.940 0.077 12.142 0.000
## textspeed 0.968 0.112 8.647 0.000
# Look at the dataset: Bollen's Political Democracy data (75 observations);
# y1-y4 are the 1960 democracy indicators used below.
data(PoliticalDemocracy, package="lavaan")
head(PoliticalDemocracy)
## y1 y2 y3 y4 y5 y6 y7 y8 x1
## 1 2.50 0.000000 3.333333 0.000000 1.250000 0.000000 3.726360 3.333333 4.442651
## 2 1.25 0.000000 3.333333 0.000000 6.250000 1.100000 6.666666 0.736999 5.384495
## 3 7.50 8.800000 9.999998 9.199991 8.750000 8.094061 9.999998 8.211809 5.961005
## 4 8.90 8.800000 9.999998 9.199991 8.907948 8.127979 9.999998 4.615086 6.285998
## 5 10.00 3.333333 9.999998 6.666666 7.500000 3.333333 9.999998 6.666666 5.863631
## 6 7.50 3.333333 6.666666 6.666666 6.250000 1.100000 6.666666 0.368500 5.533389
## x2 x3
## 1 3.637586 2.557615
## 2 5.062595 3.568079
## 3 6.255750 5.224433
## 4 7.567863 6.267495
## 5 6.818924 4.573679
## 6 5.135798 3.892270
# Define your model specification: one latent 1960-democracy factor
# measured by y1-y4
politics.model <- "poldemo60 =~ y1 + y2 + y3 + y4"
# Analyze the model with cfa()
politics.fit <- lavaan::cfa(model = politics.model, data = PoliticalDemocracy)
# Summarize the model with standardized estimates and fit measures in one call
summary(politics.fit, standardized=TRUE, fit.measures=TRUE)
## lavaan 0.6-5 ended normally after 26 iterations
##
## Estimator ML
## Optimization method NLMINB
## Number of free parameters 8
##
## Number of observations 75
##
## Model Test User Model:
##
## Test statistic 10.006
## Degrees of freedom 2
## P-value (Chi-square) 0.007
##
## Model Test Baseline Model:
##
## Test statistic 159.183
## Degrees of freedom 6
## P-value 0.000
##
## User Model versus Baseline Model:
##
## Comparative Fit Index (CFI) 0.948
## Tucker-Lewis Index (TLI) 0.843
##
## Loglikelihood and Information Criteria:
##
## Loglikelihood user model (H0) -704.138
## Loglikelihood unrestricted model (H1) -699.135
##
## Akaike (AIC) 1424.275
## Bayesian (BIC) 1442.815
## Sample-size adjusted Bayesian (BIC) 1417.601
##
## Root Mean Square Error of Approximation:
##
## RMSEA 0.231
## 90 Percent confidence interval - lower 0.103
## 90 Percent confidence interval - upper 0.382
## P-value RMSEA <= 0.05 0.014
##
## Standardized Root Mean Square Residual:
##
## SRMR 0.046
##
## Parameter Estimates:
##
## Information Expected
## Information saturated (h1) model Structured
## Standard errors Standard
##
## Latent Variables:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## poldemo60 =~
## y1 1.000 2.133 0.819
## y2 1.404 0.197 7.119 0.000 2.993 0.763
## y3 1.089 0.167 6.529 0.000 2.322 0.712
## y4 1.370 0.167 8.228 0.000 2.922 0.878
##
## Variances:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## .y1 2.239 0.512 4.371 0.000 2.239 0.330
## .y2 6.412 1.293 4.960 0.000 6.412 0.417
## .y3 5.229 0.990 5.281 0.000 5.229 0.492
## .y4 2.530 0.765 3.306 0.001 2.530 0.229
## poldemo60 4.548 1.106 4.112 0.000 1.000 1.000
Chapter 2 - Multi-Factor Models
Multifactor Specification - exploring multiple latent variables and the relationships among them:
Model Structure:
Modification Indices:
Model Comparison:
Example code includes:
# Create your text model specification: one "text" factor with three
# indicators
text.model <- 'text =~ x4 + x5 + x6'
# Analyze the model
text.fit <- cfa(model=text.model, data=HolzingerSwineford1939)
# Summarize the model
summary(text.fit, standardized = TRUE, fit.measures = TRUE)
# Update the model specification by setting two paths to the label a:
# sharing one label constrains the x5 and x6 loadings to be equal
text.model <- 'text =~ x4 + a*x5 + a*x6'
# Analyze the model
text.fit <- cfa(model = text.model, data = HolzingerSwineford1939)
# Summarize the model
summary(text.fit, standardized = TRUE, fit.measures = TRUE)
# Create a two-factor model of text and speed variables
twofactor.model <- 'text =~ x4 + x5 + x6
speed =~ x7 + x8 + x9'
# Previous one-factor model output (shown again for comparison)
summary(text.fit, standardized = TRUE, fit.measures = TRUE)
# Two-factor model specification (repeated from above)
twofactor.model <- 'text =~ x4 + x5 + x6
speed =~ x7 + x8 + x9'
# Use cfa() to analyze the model; the two factors are allowed to covary by
# default
twofactor.fit <- cfa(model=twofactor.model, data=HolzingerSwineford1939)
# Use summary() to view the fitted model
summary(twofactor.fit, standardized = TRUE, fit.measures = TRUE)
# Load the Eysenck Personality Inventory data from the psych package
data(epi, package="psych")
# Specify a three-factor model with one correlation set to zero:
# extraversion ~~ 0*neuroticism fixes that latent covariance at 0
epi.model <- 'extraversion =~ V1 + V3 + V5 + V8
neuroticism =~ V2 + V4 + V7 + V9
lying =~ V6 + V12 + V18 + V24
extraversion ~~ 0*neuroticism'
# Run the model
epi.fit <- cfa(model = epi.model, data = epi)
# Examine the output
summary(epi.fit, standardized = TRUE, fit.measures = TRUE)
# Specify a three-factor model where lying is predicted by neuroticism
# (~ is a regression path, unlike the ~~ covariance operator above)
epi.model <- 'extraversion =~ V1 + V3 + V5 + V8
neuroticism =~ V2 + V4 + V7 + V9
lying =~ V6 + V12 + V18 + V24
lying ~ neuroticism'
# Run the model
epi.fit <- cfa(model = epi.model, data = epi)
# Examine the output
summary(epi.fit, standardized = TRUE, fit.measures = TRUE)
# Calculate the variance of V1 (NA values removed)
var(epi$V1, na.rm=TRUE)
# Examine the modification indices, sorted largest-improvement first
modificationindices(epi.fit, sort=TRUE)
# Edit the model specification: add a cross-loading of V3 on neuroticism
epi.model1 <- 'extraversion =~ V1 + V3 + V5 + V8
neuroticism =~ V2 + V4 + V7 + V9
lying =~ V6 + V12 + V18 + V24
neuroticism =~ V3'
# Reanalyze the model
epi.fit1 <- cfa(model = epi.model1, data = epi)
# Summarize the updated model
summary(epi.fit1, standardized = TRUE, fit.measures = TRUE)
# Analyze the original model
epi.fit <- cfa(model = epi.model, data = epi)
# Analyze the updated model
epi.fit1 <- cfa(model = epi.model1, data = epi)
# Compare those models with a chi-square difference test
anova(epi.fit, epi.fit1)
# Analyze the original model
epi.fit <- cfa(model = epi.model, data = epi)
# Find the fit indices for the original model (AIC and ECVI; lower is better)
fitmeasures(epi.fit)[c("aic", "ecvi")]
# Analyze the updated model
epi.fit1 <- cfa(model = epi.model1, data = epi)
# Find the fit indices for the updated model
fitmeasures(epi.fit1)[c("aic", "ecvi")]
Chapter 3 - Troubleshooting Model Errors and Diagrams
Heywood Cases on the Latent Variable:
Heywood Cases on the Manifest Variable (negative error variances):
Create Diagrams with semPaths():
Example code includes:
# Simulated adoption-survey data constructed to produce a Heywood case on the
# latent covariance when the two-factor model is fit below
badlatentdata <- readr::read_csv("./RInputFiles/badlatentdata.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## X1 = col_double(),
## V1 = col_double(),
## V2 = col_double(),
## V3 = col_double(),
## V4 = col_double(),
## V5 = col_double(),
## V6 = col_double()
## )
# Simulated data constructed to produce a negative (Heywood) error variance
# on a manifest variable
badvardata <- readr::read_csv("./RInputFiles/badvardata.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## X1 = col_double(),
## V1 = col_double(),
## V2 = col_double(),
## V3 = col_double(),
## V4 = col_double(),
## V5 = col_double(),
## V6 = col_double()
## )
# Drop the row-index column added by read_csv and give the six survey items
# descriptive names (three "good story" items, three "in person" items)
adoptsurvey <- badlatentdata %>%
select(-X1) %>%
rename(pictures=V1, background=V2, loveskids=V3, energy=V4, wagstail=V5, playful=V6)
# Look at the data (give.attr=FALSE suppresses the readr spec attributes)
str(adoptsurvey, give.attr=FALSE)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 100 obs. of 6 variables:
## $ pictures : num 3.66318 -0.00508 2.99697 -0.90249 4.54211 ...
## $ background: num 3.07 7.7 1.51 3.03 7.22 ...
## $ loveskids : num 10.31 3.06 6.61 1.54 3.38 ...
## $ energy : num 3.68 2.42 3.51 -3.04 12.93 ...
## $ wagstail : num 5.26 7.05 4.25 2.17 6.23 ...
## $ playful : num 8.275 11.727 0.675 2.457 13.43 ...
# Preview the first six rows
head(adoptsurvey)
## # A tibble: 6 x 6
## pictures background loveskids energy wagstail playful
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 3.66 3.07 10.3 3.68 5.26 8.28
## 2 -0.00508 7.70 3.06 2.42 7.05 11.7
## 3 3.00 1.51 6.61 3.51 4.25 0.675
## 4 -0.902 3.03 1.54 -3.04 2.17 2.46
## 5 4.54 7.22 3.38 12.9 6.23 13.4
## 6 0.0257 -4.35 -1.95 -6.07 3.13 5.60
# Build the model: two correlated latent factors with three indicators each
adopt.model <- 'goodstory =~ pictures + background + loveskids
inperson =~ energy + wagstail + playful'
# Analyze the model; the warning below is the Heywood case -- the estimated
# latent covariance matrix is not positive definite
adopt.fit <- cfa(model = adopt.model, data = adoptsurvey)
## Warning in lav_object_post_check(object): lavaan WARNING: covariance matrix of latent variables
## is not positive definite;
## use lavInspect(fit, "cov.lv") to investigate.
# Inspect the latent covariance matrix: the goodstory/inperson covariance
# (4.780) is impossibly large relative to the tiny goodstory variance (0.397)
lavInspect(adopt.fit, "cov.lv")
## gdstry inprsn
## goodstory 0.397
## inperson 4.780 4.505
# The summary shows a standardized covariance of 3.575 (> 1), confirming
# the problem
summary(adopt.fit, standardized=TRUE, fit.measures=TRUE)
## lavaan 0.6-5 ended normally after 61 iterations
##
## Estimator ML
## Optimization method NLMINB
## Number of free parameters 13
##
## Number of observations 100
##
## Model Test User Model:
##
## Test statistic 15.674
## Degrees of freedom 8
## P-value (Chi-square) 0.047
##
## Model Test Baseline Model:
##
## Test statistic 74.694
## Degrees of freedom 15
## P-value 0.000
##
## User Model versus Baseline Model:
##
## Comparative Fit Index (CFI) 0.871
## Tucker-Lewis Index (TLI) 0.759
##
## Loglikelihood and Information Criteria:
##
## Loglikelihood user model (H0) -1666.548
## Loglikelihood unrestricted model (H1) -1658.711
##
## Akaike (AIC) 3359.096
## Bayesian (BIC) 3392.963
## Sample-size adjusted Bayesian (BIC) 3351.906
##
## Root Mean Square Error of Approximation:
##
## RMSEA 0.098
## 90 Percent confidence interval - lower 0.010
## 90 Percent confidence interval - upper 0.170
## P-value RMSEA <= 0.05 0.126
##
## Standardized Root Mean Square Residual:
##
## SRMR 0.080
##
## Parameter Estimates:
##
## Information Expected
## Information saturated (h1) model Structured
## Standard errors Standard
##
## Latent Variables:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## goodstory =~
## pictures 1.000 0.630 0.200
## background 1.089 0.347 3.135 0.002 0.686 0.168
## loveskids 0.041 0.259 0.158 0.874 0.026 0.006
## inperson =~
## energy 1.000 2.122 0.538
## wagstail 1.134 0.300 3.780 0.000 2.406 0.473
## playful 0.601 0.213 2.823 0.005 1.275 0.329
##
## Covariances:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## goodstory ~~
## inperson 4.780 1.248 3.830 0.000 3.575 3.575
##
## Variances:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## .pictures 9.570 1.749 5.471 0.000 9.570 0.960
## .background 16.198 2.641 6.133 0.000 16.198 0.972
## .loveskids 21.675 3.065 7.071 0.000 21.675 1.000
## .energy 11.031 1.920 5.745 0.000 11.031 0.710
## .wagstail 20.085 3.199 6.278 0.000 20.085 0.776
## .playful 13.382 1.955 6.845 0.000 13.382 0.892
## goodstory 0.397 1.176 0.338 0.736 1.000 1.000
## inperson 4.505 1.910 2.359 0.018 1.000 1.000
# Edit the original model: collapse all six items onto a single factor so
# no latent covariance needs to be estimated
adopt.model <- 'goodstory =~ pictures + background + loveskids + energy + wagstail + playful'
# Analyze the model
adopt.fit <- cfa(model = adopt.model, data = adoptsurvey)
# Look for Heywood cases (this refit produces none -- all variances positive)
summary(adopt.fit, standardized = TRUE, fit.measures = TRUE)
## lavaan 0.6-5 ended normally after 49 iterations
##
## Estimator ML
## Optimization method NLMINB
## Number of free parameters 12
##
## Number of observations 100
##
## Model Test User Model:
##
## Test statistic 27.071
## Degrees of freedom 9
## P-value (Chi-square) 0.001
##
## Model Test Baseline Model:
##
## Test statistic 74.694
## Degrees of freedom 15
## P-value 0.000
##
## User Model versus Baseline Model:
##
## Comparative Fit Index (CFI) 0.697
## Tucker-Lewis Index (TLI) 0.495
##
## Loglikelihood and Information Criteria:
##
## Loglikelihood user model (H0) -1672.246
## Loglikelihood unrestricted model (H1) -1658.711
##
## Akaike (AIC) 3368.493
## Bayesian (BIC) 3399.755
## Sample-size adjusted Bayesian (BIC) 3361.856
##
## Root Mean Square Error of Approximation:
##
## RMSEA 0.142
## 90 Percent confidence interval - lower 0.082
## 90 Percent confidence interval - upper 0.205
## P-value RMSEA <= 0.05 0.009
##
## Standardized Root Mean Square Residual:
##
## SRMR 0.086
##
## Parameter Estimates:
##
## Information Expected
## Information saturated (h1) model Structured
## Standard errors Standard
##
## Latent Variables:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## goodstory =~
## pictures 1.000 1.773 0.562
## background 0.892 0.337 2.650 0.008 1.581 0.387
## loveskids 0.547 0.344 1.587 0.112 0.969 0.208
## energy 1.194 0.372 3.214 0.001 2.118 0.537
## wagstail 1.712 0.517 3.310 0.001 3.035 0.597
## playful 0.773 0.312 2.480 0.013 1.371 0.354
##
## Variances:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## .pictures 6.824 1.323 5.160 0.000 6.824 0.685
## .background 14.168 2.228 6.359 0.000 14.168 0.850
## .loveskids 20.736 3.009 6.891 0.000 20.736 0.957
## .energy 11.051 2.049 5.394 0.000 11.051 0.711
## .wagstail 16.661 3.486 4.779 0.000 16.661 0.644
## .playful 13.128 2.021 6.496 0.000 13.128 0.875
## goodstory 3.143 1.369 2.296 0.022 1.000 1.000
# Rebuild adoptsurvey from the bad-variance dataset: drop the read_csv index
# column and reuse the descriptive item names
adoptsurvey <- badvardata %>%
select(-X1) %>%
rename(pictures=V1, background=V2, loveskids=V3, energy=V4, wagstail=V5, playful=V6)
# Inspect the structure (give.attr=FALSE hides the readr spec attributes)
str(adoptsurvey, give.attr=FALSE)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 100 obs. of 6 variables:
## $ pictures : num 3.71 1.24 1.19 -1.26 4.58 ...
## $ background: num -0.964 6.38 -4.329 5.196 -0.145 ...
## $ loveskids : num 3.86 5.95 8.23 2.46 9.53 ...
## $ energy : num -6.73 1.61 4.09 7.6 -3.13 ...
## $ wagstail : num -1.199 0.532 4.59 3.699 2.546 ...
## $ playful : num 4.1 1.93 4.04 4.56 3.43 ...
# Numeric five-number summaries (plus mean) for each survey item
summary(adoptsurvey)
## pictures background loveskids energy
## Min. :-4.5482 Min. :-4.83064 Min. :-7.162 Min. :-6.729
## 1st Qu.:-0.5815 1st Qu.:-0.03834 1st Qu.: 1.751 1st Qu.: 0.386
## Median : 1.7705 Median : 3.33339 Median : 4.585 Median : 2.081
## Mean : 1.7912 Mean : 3.04096 Mean : 5.178 Mean : 2.362
## 3rd Qu.: 3.6228 3rd Qu.: 5.57599 3rd Qu.: 8.709 3rd Qu.: 4.906
## Max. : 9.4674 Max. :14.81218 Max. :18.237 Max. :12.091
## wagstail playful
## Min. :-9.945 Min. :-4.913
## 1st Qu.:-1.226 1st Qu.: 1.811
## Median : 2.160 Median : 3.916
## Mean : 2.346 Mean : 3.711
## 3rd Qu.: 5.242 3rd Qu.: 5.751
## Max. :19.811 Max. :11.446
# Build the model: the same two-factor specification as before
adopt.model <- 'goodstory =~ pictures + background + loveskids
inperson =~ energy + wagstail + playful'
# Analyze the model; the warning flags a negative estimated observed-variable
# error variance (a Heywood case on a manifest variable)
adopt.fit <- cfa(model=adopt.model, data=adoptsurvey)
## Warning in lav_object_post_check(object): lavaan WARNING: some estimated ov
## variances are negative
# Summarize the model to view the negative variances (see .wagstail in the
# output, with its NA R-square)
summary(adopt.fit, standardized=TRUE, fit.measures=TRUE, rsquare=TRUE)
## lavaan 0.6-5 ended normally after 303 iterations
##
## Estimator ML
## Optimization method NLMINB
## Number of free parameters 13
##
## Number of observations 100
##
## Model Test User Model:
##
## Test statistic 7.134
## Degrees of freedom 8
## P-value (Chi-square) 0.522
##
## Model Test Baseline Model:
##
## Test statistic 25.380
## Degrees of freedom 15
## P-value 0.045
##
## User Model versus Baseline Model:
##
## Comparative Fit Index (CFI) 1.000
## Tucker-Lewis Index (TLI) 1.156
##
## Loglikelihood and Information Criteria:
##
## Loglikelihood user model (H0) -1649.956
## Loglikelihood unrestricted model (H1) -1646.389
##
## Akaike (AIC) 3325.912
## Bayesian (BIC) 3359.779
## Sample-size adjusted Bayesian (BIC) 3318.722
##
## Root Mean Square Error of Approximation:
##
## RMSEA 0.000
## 90 Percent confidence interval - lower 0.000
## 90 Percent confidence interval - upper 0.109
## P-value RMSEA <= 0.05 0.686
##
## Standardized Root Mean Square Residual:
##
## SRMR 0.050
##
## Parameter Estimates:
##
## Information Expected
## Information saturated (h1) model Structured
## Standard errors Standard
##
## Latent Variables:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## goodstory =~
## pictures 1.000 1.360 0.437
## background 1.471 0.763 1.928 0.054 2.000 0.521
## loveskids 1.746 0.892 1.958 0.050 2.375 0.501
## inperson =~
## energy 1.000 0.208 0.058
## wagstail 45.262 1090.143 0.042 0.967 9.409 1.969
## playful 0.869 1.110 0.783 0.434 0.181 0.054
##
## Covariances:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## goodstory ~~
## inperson -0.014 0.332 -0.041 0.967 -0.048 -0.048
##
## Variances:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## .pictures 7.814 1.514 5.162 0.000 7.814 0.809
## .background 10.762 2.695 3.993 0.000 10.762 0.729
## .loveskids 16.791 3.936 4.266 0.000 16.791 0.749
## .energy 12.642 2.066 6.119 0.000 12.642 0.997
## .wagstail -65.677 2124.215 -0.031 0.975 -65.677 -2.875
## .playful 11.148 1.760 6.335 0.000 11.148 0.997
## goodstory 1.850 1.310 1.411 0.158 1.000 1.000
## inperson 0.043 1.046 0.041 0.967 1.000 1.000
##
## R-Square:
## Estimate
## pictures 0.191
## background 0.271
## loveskids 0.251
## energy 0.003
## wagstail NA
## playful 0.003
# View the variance of the problem manifest variable (wagstail had the
# negative error variance above)
var(adoptsurvey$wagstail)
## [1] 23.07446
# Update the model using 5 decimal places: fix wagstail's error variance to
# its observed variance so the optimizer cannot drive it negative
adopt.model2 <- 'goodstory =~ pictures + background + loveskids
inperson =~ energy + wagstail + playful
wagstail~~23.07446*wagstail'
# Analyze and summarize the updated model (wagstail's variance now appears
# as a fixed parameter with no standard error)
adopt.fit2 <- cfa(model = adopt.model2, data = adoptsurvey)
summary(adopt.fit2, standardized=TRUE, fit.measures=TRUE, rsquare=TRUE)
## lavaan 0.6-5 ended normally after 69 iterations
##
## Estimator ML
## Optimization method NLMINB
## Number of free parameters 12
##
## Number of observations 100
##
## Model Test User Model:
##
## Test statistic 8.493
## Degrees of freedom 9
## P-value (Chi-square) 0.485
##
## Model Test Baseline Model:
##
## Test statistic 25.380
## Degrees of freedom 15
## P-value 0.045
##
## User Model versus Baseline Model:
##
## Comparative Fit Index (CFI) 1.000
## Tucker-Lewis Index (TLI) 1.081
##
## Loglikelihood and Information Criteria:
##
## Loglikelihood user model (H0) -1650.635
## Loglikelihood unrestricted model (H1) -1646.389
##
## Akaike (AIC) 3325.270
## Bayesian (BIC) 3356.532
## Sample-size adjusted Bayesian (BIC) 3318.633
##
## Root Mean Square Error of Approximation:
##
## RMSEA 0.000
## 90 Percent confidence interval - lower 0.000
## 90 Percent confidence interval - upper 0.108
## P-value RMSEA <= 0.05 0.664
##
## Standardized Root Mean Square Residual:
##
## SRMR 0.058
##
## Parameter Estimates:
##
## Information Expected
## Information saturated (h1) model Structured
## Standard errors Standard
##
## Latent Variables:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## goodstory =~
## pictures 1.000 1.344 0.432
## background 1.461 0.758 1.928 0.054 1.964 0.511
## loveskids 1.818 0.947 1.919 0.055 2.444 0.516
## inperson =~
## energy 1.000 0.959 0.269
## wagstail 1.391 2.244 0.620 0.535 1.334 0.268
## playful 0.807 1.640 0.492 0.623 0.774 0.231
##
## Covariances:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## goodstory ~~
## inperson -0.077 0.450 -0.172 0.863 -0.060 -0.060
##
## Variances:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## .wagstail 23.074 23.074 0.928
## .pictures 7.857 1.510 5.203 0.000 7.857 0.813
## .background 10.906 2.672 4.082 0.000 10.906 0.739
## .loveskids 16.461 4.103 4.012 0.000 16.461 0.734
## .energy 11.765 2.683 4.385 0.000 11.765 0.928
## .playful 10.582 2.082 5.084 0.000 10.582 0.946
## goodstory 1.807 1.296 1.395 0.163 1.000 1.000
## inperson 0.920 2.209 0.416 0.677 1.000 1.000
##
## R-Square:
## Estimate
## wagstail 0.072
## pictures 0.187
## background 0.261
## loveskids 0.266
## energy 0.072
## playful 0.054
#Create a default picture
# Draw a path diagram of the fitted two-factor CFA (adopt.fit was fitted
# earlier in this document; its summary output appears above)
semPlot::semPaths(adopt.fit)
## Registered S3 methods overwritten by 'huge':
## method from
## plot.sim BDgraph
## print.sim BDgraph
#Update the default picture
# layout = "tree" arranges the diagram hierarchically; rotation = 2 rotates
# the tree (NOTE(review): presumably placing latents at the side -- confirm
# against the semPlot::semPaths documentation)
semPlot::semPaths(object = adopt.fit, layout="tree", rotation=2)
#Update the default picture
# whatLabels = "std" / what = "std" label and weight the edges by the
# standardized parameter estimates; edge.label.cex controls label size
semPlot::semPaths(object = adopt.fit, layout = "tree", rotation = 2, whatLabels = "std",
edge.label.cex = 1, what = "std", edge.color = "blue"
)
Chapter 4 - Full Example and Extension
Model WAIS-III IQ Scale:
Update WAIS-III Model:
Hierarchical Model of IQ:
Wrap Up:
Example code includes:
# Load the WAIS-III subtest scores (300 rows, 13 columns per the glimpse
# below); read_csv() returns a tibble and reports the inferred column types
IQdata <- readr::read_csv("./RInputFiles/IQdata.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## X1 = col_double(),
## inform = col_double(),
## simil = col_double(),
## vocab = col_double(),
## compreh = col_double(),
## digspan = col_double(),
## arith = col_double(),
## piccomp = col_double(),
## block = col_double(),
## matrixreason = col_double(),
## symbolsearch = col_double(),
## digsym = col_double(),
## lnseq = col_double()
## )
# Inspect the structure: note the X1 row-index column carried over from the CSV
glimpse(IQdata)
## Observations: 300
## Variables: 13
## $ X1 <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...
## $ inform <dbl> 31, 15, 13, 13, 22, 25, 20, 18, 21, 22, 16, 23, 17, 19...
## $ simil <dbl> 23, 20, 22, 21, 21, 22, 25, 25, 22, 25, 25, 24, 18, 23...
## $ vocab <dbl> 63, 44, 40, 51, 55, 61, 45, 61, 57, 56, 62, 53, 44, 48...
## $ compreh <dbl> 27, 21, 28, 21, 28, 27, 23, 28, 27, 22, 28, 30, 27, 27...
## $ digspan <dbl> 20, 13, 14, 22, 17, 20, 13, 22, 14, 15, 15, 26, 15, 20...
## $ arith <dbl> 18, 12, 13, 13, 10, 20, 16, 14, 16, 10, 13, 21, 11, 13...
## $ piccomp <dbl> 18, 13, 13, 16, 13, 18, 16, 22, 16, 16, 20, 19, 15, 15...
## $ block <dbl> 50, 29, 28, 36, 22, 59, 33, 43, 40, 31, 35, 59, 45, 58...
## $ matrixreason <dbl> 21, 17, 16, 14, 13, 18, 14, 18, 13, 13, 21, 16, 11, 15...
## $ symbolsearch <dbl> 38, 24, 25, 27, 27, 38, 31, 42, 34, 29, 37, 40, 33, 36...
## $ digsym <dbl> 57, 56, 72, 67, 60, 78, 60, 45, 40, 57, 63, 87, 73, 54...
## $ lnseq <dbl> 15, 12, 13, 18, 15, 16, 12, 30, 19, 16, 23, 16, 12, 21...
# Drop the X1 index column so only the 12 WAIS-III subtests remain
IQdata <- IQdata %>%
select(-X1)
# Confirm the column was removed (12 variables below)
glimpse(IQdata)
## Observations: 300
## Variables: 12
## $ inform <dbl> 31, 15, 13, 13, 22, 25, 20, 18, 21, 22, 16, 23, 17, 19...
## $ simil <dbl> 23, 20, 22, 21, 21, 22, 25, 25, 22, 25, 25, 24, 18, 23...
## $ vocab <dbl> 63, 44, 40, 51, 55, 61, 45, 61, 57, 56, 62, 53, 44, 48...
## $ compreh <dbl> 27, 21, 28, 21, 28, 27, 23, 28, 27, 22, 28, 30, 27, 27...
## $ digspan <dbl> 20, 13, 14, 22, 17, 20, 13, 22, 14, 15, 15, 26, 15, 20...
## $ arith <dbl> 18, 12, 13, 13, 10, 20, 16, 14, 16, 10, 13, 21, 11, 13...
## $ piccomp <dbl> 18, 13, 13, 16, 13, 18, 16, 22, 16, 16, 20, 19, 15, 15...
## $ block <dbl> 50, 29, 28, 36, 22, 59, 33, 43, 40, 31, 35, 59, 45, 58...
## $ matrixreason <dbl> 21, 17, 16, 14, 13, 18, 14, 18, 13, 13, 21, 16, 11, 15...
## $ symbolsearch <dbl> 38, 24, 25, 27, 27, 38, 31, 42, 34, 29, 37, 40, 33, 36...
## $ digsym <dbl> 57, 56, 72, 67, 60, 78, 60, 45, 40, 57, 63, 87, 73, 54...
## $ lnseq <dbl> 15, 12, 13, 18, 15, 16, 12, 30, 19, 16, 23, 16, 12, 21...
#Build a four-factor model
# lavaan model syntax: "latent =~ indicators" declares a latent factor
# measured by the listed observed subtests
wais.model <- 'verbalcomp =~ vocab + simil + inform + compreh
workingmemory =~ arith + digspan + lnseq
perceptorg =~ piccomp + block + matrixreason
processing =~ digsym + symbolsearch'
#Analyze the model
# Fit the CFA (lavaan 0.6-5 per the summary output). The fit warns that the
# latent covariance matrix is not positive definite -- see the standardized
# perceptorg ~~ processing covariance of 1.031 in the output below
wais.fit <- cfa(model=wais.model, data=IQdata)
## Warning in lav_object_post_check(object): lavaan WARNING: covariance matrix of latent variables
## is not positive definite;
## use lavInspect(fit, "cov.lv") to investigate.
#Summarize the model
# Request standardized loadings (Std.lv/Std.all columns), the extended fit
# indices (CFI/TLI/RMSEA/SRMR), and per-indicator R-squared values
summary(wais.fit, standardized=TRUE, fit.measures=TRUE, rsquare=TRUE)
## lavaan 0.6-5 ended normally after 153 iterations
##
## Estimator ML
## Optimization method NLMINB
## Number of free parameters 30
##
## Number of observations 300
##
## Model Test User Model:
##
## Test statistic 233.268
## Degrees of freedom 48
## P-value (Chi-square) 0.000
##
## Model Test Baseline Model:
##
## Test statistic 1042.916
## Degrees of freedom 66
## P-value 0.000
##
## User Model versus Baseline Model:
##
## Comparative Fit Index (CFI) 0.810
## Tucker-Lewis Index (TLI) 0.739
##
## Loglikelihood and Information Criteria:
##
## Loglikelihood user model (H0) -9939.800
## Loglikelihood unrestricted model (H1) -9823.166
##
## Akaike (AIC) 19939.599
## Bayesian (BIC) 20050.713
## Sample-size adjusted Bayesian (BIC) 19955.570
##
## Root Mean Square Error of Approximation:
##
## RMSEA 0.113
## 90 Percent confidence interval - lower 0.099
## 90 Percent confidence interval - upper 0.128
## P-value RMSEA <= 0.05 0.000
##
## Standardized Root Mean Square Residual:
##
## SRMR 0.073
##
## Parameter Estimates:
##
## Information Expected
## Information saturated (h1) model Structured
## Standard errors Standard
##
## Latent Variables:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## verbalcomp =~
## vocab 1.000 6.282 0.879
## simil 0.296 0.031 9.470 0.000 1.859 0.581
## inform 0.450 0.043 10.483 0.000 2.825 0.645
## compreh 0.315 0.035 8.986 0.000 1.979 0.551
## workingmemory =~
## arith 1.000 2.530 0.845
## digspan 0.875 0.137 6.373 0.000 2.213 0.561
## lnseq 0.225 0.106 2.130 0.033 0.570 0.142
## perceptorg =~
## piccomp 1.000 1.391 0.596
## block 3.988 0.421 9.477 0.000 5.546 0.719
## matrixreason 0.909 0.127 7.171 0.000 1.264 0.494
## processing =~
## digsym 1.000 2.809 0.239
## symbolsearch 1.065 0.300 3.547 0.000 2.990 0.724
##
## Covariances:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## verbalcomp ~~
## workingmemory 6.120 1.232 4.969 0.000 0.385 0.385
## perceptorg 5.644 0.868 6.503 0.000 0.646 0.646
## processing 10.050 3.150 3.190 0.001 0.570 0.570
## workingmemory ~~
## perceptorg 2.437 0.371 6.561 0.000 0.693 0.693
## processing 2.701 0.984 2.745 0.006 0.380 0.380
## perceptorg ~~
## processing 4.027 1.200 3.356 0.001 1.031 1.031
##
## Variances:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## .vocab 11.573 2.656 4.357 0.000 11.573 0.227
## .simil 6.792 0.620 10.951 0.000 6.792 0.663
## .inform 11.201 1.084 10.330 0.000 11.201 0.584
## .compreh 8.969 0.804 11.157 0.000 8.969 0.696
## .arith 2.560 0.901 2.842 0.004 2.560 0.286
## .digspan 10.653 1.102 9.666 0.000 10.653 0.685
## .lnseq 15.750 1.294 12.173 0.000 15.750 0.980
## .piccomp 3.505 0.323 10.851 0.000 3.505 0.644
## .block 28.761 3.207 8.968 0.000 28.761 0.483
## .matrixreason 4.957 0.431 11.509 0.000 4.957 0.756
## .digsym 130.314 10.847 12.014 0.000 130.314 0.943
## .symbolsearch 8.127 2.480 3.277 0.001 8.127 0.476
## verbalcomp 39.459 4.757 8.294 0.000 1.000 1.000
## workingmemory 6.399 1.122 5.703 0.000 1.000 1.000
## perceptorg 1.934 0.371 5.211 0.000 1.000 1.000
## processing 7.889 4.309 1.831 0.067 1.000 1.000
##
## R-Square:
## Estimate
## vocab 0.773
## simil 0.337
## inform 0.416
## compreh 0.304
## arith 0.714
## digspan 0.315
## lnseq 0.020
## piccomp 0.356
## block 0.517
## matrixreason 0.244
## digsym 0.057
## symbolsearch 0.524
#Edit the original model
# Collapse the problematic "processing" factor into perceptorg, giving a
# three-factor model; this addresses the non-positive-definite latent
# covariance warning from the four-factor fit (perceptorg ~~ processing
# had a standardized covariance of 1.031)
wais.model <- 'verbalcomp =~ vocab + simil + inform + compreh
workingmemory =~ arith + digspan + lnseq
perceptorg =~ piccomp + block + matrixreason + digsym + symbolsearch'
#Analyze the model
wais.fit <- cfa(model=wais.model, data=IQdata)
#Summarize the model
# Same summary options as before: standardized estimates, fit indices, R-squared
summary(wais.fit, standardized=TRUE, fit.measures=TRUE, rsquare=TRUE)
## lavaan 0.6-5 ended normally after 110 iterations
##
## Estimator ML
## Optimization method NLMINB
## Number of free parameters 27
##
## Number of observations 300
##
## Model Test User Model:
##
## Test statistic 252.809
## Degrees of freedom 51
## P-value (Chi-square) 0.000
##
## Model Test Baseline Model:
##
## Test statistic 1042.916
## Degrees of freedom 66
## P-value 0.000
##
## User Model versus Baseline Model:
##
## Comparative Fit Index (CFI) 0.793
## Tucker-Lewis Index (TLI) 0.733
##
## Loglikelihood and Information Criteria:
##
## Loglikelihood user model (H0) -9949.570
## Loglikelihood unrestricted model (H1) -9823.166
##
## Akaike (AIC) 19953.141
## Bayesian (BIC) 20053.143
## Sample-size adjusted Bayesian (BIC) 19967.515
##
## Root Mean Square Error of Approximation:
##
## RMSEA 0.115
## 90 Percent confidence interval - lower 0.101
## 90 Percent confidence interval - upper 0.129
## P-value RMSEA <= 0.05 0.000
##
## Standardized Root Mean Square Residual:
##
## SRMR 0.076
##
## Parameter Estimates:
##
## Information Expected
## Information saturated (h1) model Structured
## Standard errors Standard
##
## Latent Variables:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## verbalcomp =~
## vocab 1.000 6.281 0.879
## simil 0.296 0.031 9.483 0.000 1.861 0.581
## inform 0.449 0.043 10.481 0.000 2.822 0.644
## compreh 0.315 0.035 8.999 0.000 1.981 0.552
## workingmemory =~
## arith 1.000 2.528 0.844
## digspan 0.881 0.152 5.786 0.000 2.227 0.565
## lnseq 0.205 0.107 1.920 0.055 0.518 0.129
## perceptorg =~
## piccomp 1.000 1.517 0.650
## block 3.739 0.390 9.583 0.000 5.672 0.735
## matrixreason 0.832 0.117 7.099 0.000 1.262 0.493
## digsym 1.603 0.507 3.160 0.002 2.431 0.207
## symbolsearch 1.880 0.204 9.236 0.000 2.852 0.690
##
## Covariances:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## verbalcomp ~~
## workingmemory 6.132 1.234 4.970 0.000 0.386 0.386
## perceptorg 5.892 0.886 6.647 0.000 0.618 0.618
## workingmemory ~~
## perceptorg 2.227 0.362 6.149 0.000 0.581 0.581
##
## Variances:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## .vocab 11.577 2.651 4.367 0.000 11.577 0.227
## .simil 6.787 0.620 10.950 0.000 6.787 0.662
## .inform 11.218 1.085 10.342 0.000 11.218 0.585
## .compreh 8.962 0.803 11.155 0.000 8.962 0.696
## .arith 2.571 1.014 2.535 0.011 2.571 0.287
## .digspan 10.590 1.161 9.121 0.000 10.590 0.681
## .lnseq 15.807 1.297 12.183 0.000 15.807 0.983
## .piccomp 3.138 0.317 9.913 0.000 3.138 0.577
## .block 27.343 3.226 8.476 0.000 27.343 0.459
## .matrixreason 4.960 0.441 11.243 0.000 4.960 0.757
## .digsym 132.291 10.925 12.109 0.000 132.291 0.957
## .symbolsearch 8.936 0.957 9.333 0.000 8.936 0.524
## verbalcomp 39.455 4.754 8.299 0.000 1.000 1.000
## workingmemory 6.388 1.215 5.259 0.000 1.000 1.000
## perceptorg 2.301 0.408 5.646 0.000 1.000 1.000
##
## R-Square:
## Estimate
## vocab 0.773
## simil 0.338
## inform 0.415
## compreh 0.304
## arith 0.713
## digspan 0.319
## lnseq 0.017
## piccomp 0.423
## block 0.541
## matrixreason 0.243
## digsym 0.043
## symbolsearch 0.476
#Update the default picture
# Path diagram of the three-factor fit with standardized edge labels
semPlot::semPaths(object = wais.fit, layout = "tree", rotation = 1, whatLabels = "std",
edge.label.cex = 1, what = "std", edge.color = "black"
)
#Examine modification indices
# Each row estimates the chi-square improvement (mi column) from freeing one
# extra parameter; sort = TRUE lists the largest first. The top suggestion
# below is the correlated residual simil ~~ inform (mi = 35.879)
modificationindices(wais.fit, sort = TRUE)
## lhs op rhs mi epc sepc.lv sepc.all sepc.nox
## 66 simil ~~ inform 35.879 -3.757 -3.757 -0.431 -0.431
## 56 vocab ~~ inform 28.377 9.783 9.783 0.858 0.858
## 48 perceptorg =~ vocab 21.865 -2.077 -3.151 -0.441 -0.441
## 115 block ~~ matrixreason 16.209 -3.622 -3.622 -0.311 -0.311
## 96 arith ~~ block 15.061 3.679 3.679 0.439 0.439
## 117 block ~~ symbolsearch 13.144 5.725 5.725 0.366 0.366
## 47 workingmemory =~ symbolsearch 12.272 -0.467 -1.181 -0.286 -0.286
## 81 inform ~~ block 12.269 4.358 4.358 0.249 0.249
## 64 vocab ~~ digsym 11.578 -11.261 -11.261 -0.288 -0.288
## 40 workingmemory =~ simil 11.383 0.278 0.703 0.220 0.220
## 72 simil ~~ block 10.605 -3.084 -3.084 -0.226 -0.226
## 45 workingmemory =~ matrixreason 9.685 0.267 0.675 0.264 0.264
## 95 arith ~~ piccomp 9.463 -0.892 -0.892 -0.314 -0.314
## 60 vocab ~~ lnseq 9.425 -3.486 -3.486 -0.258 -0.258
## 67 simil ~~ compreh 9.356 1.587 1.587 0.203 0.203
## 44 workingmemory =~ block 9.258 0.765 1.933 0.251 0.251
## 51 perceptorg =~ compreh 9.177 0.601 0.912 0.254 0.254
## 62 vocab ~~ block 8.712 -5.377 -5.377 -0.302 -0.302
## 73 simil ~~ matrixreason 8.672 1.065 1.065 0.184 0.184
## 106 lnseq ~~ piccomp 8.620 1.298 1.298 0.184 0.184
## 91 compreh ~~ digsym 8.155 5.908 5.908 0.172 0.172
## 59 vocab ~~ digspan 8.127 2.849 2.849 0.257 0.257
## 37 verbalcomp =~ digsym 7.803 -0.464 -2.917 -0.248 -0.248
## 68 simil ~~ arith 7.534 1.064 1.064 0.255 0.255
## 99 arith ~~ symbolsearch 7.468 -1.391 -1.391 -0.290 -0.290
## 57 vocab ~~ compreh 7.107 -3.508 -3.508 -0.344 -0.344
## 87 compreh ~~ lnseq 7.001 1.887 1.887 0.159 0.159
## 97 arith ~~ matrixreason 6.391 0.848 0.848 0.237 0.237
## 107 lnseq ~~ block 5.677 3.289 3.289 0.158 0.158
## 34 verbalcomp =~ piccomp 5.507 0.071 0.447 0.192 0.192
## 78 inform ~~ digspan 5.435 -1.649 -1.649 -0.151 -0.151
## 33 verbalcomp =~ lnseq 5.250 -0.104 -0.652 -0.163 -0.163
## 54 perceptorg =~ lnseq 4.644 0.512 0.777 0.194 0.194
## 39 workingmemory =~ vocab 4.638 -0.406 -1.025 -0.143 -0.143
## 102 digspan ~~ block 4.564 -2.689 -2.689 -0.158 -0.158
## 35 verbalcomp =~ block 4.551 -0.218 -1.371 -0.178 -0.178
## 88 compreh ~~ piccomp 4.455 0.728 0.728 0.137 0.137
## 112 piccomp ~~ matrixreason 4.306 0.568 0.568 0.144 0.144
## 101 digspan ~~ piccomp 4.218 0.808 0.808 0.140 0.140
## 46 workingmemory =~ digsym 4.139 -0.852 -2.152 -0.183 -0.183
## 71 simil ~~ piccomp 4.029 0.607 0.607 0.132 0.132
## 76 inform ~~ compreh 3.789 -1.367 -1.367 -0.136 -0.136
## 70 simil ~~ lnseq 3.693 -1.200 -1.200 -0.116 -0.116
## 50 perceptorg =~ inform 3.487 0.444 0.673 0.154 0.154
## 58 vocab ~~ arith 3.451 -1.457 -1.457 -0.267 -0.267
## 55 vocab ~~ simil 3.393 2.239 2.239 0.253 0.253
## 113 piccomp ~~ digsym 3.375 2.419 2.419 0.119 0.119
## 93 arith ~~ digspan 3.274 7.960 7.960 1.526 1.526
## 86 compreh ~~ digspan 3.234 -1.110 -1.110 -0.114 -0.114
## 80 inform ~~ piccomp 2.871 -0.672 -0.672 -0.113 -0.113
## 104 digspan ~~ digsym 2.754 -3.822 -3.822 -0.102 -0.102
## 114 piccomp ~~ symbolsearch 2.677 -0.731 -0.731 -0.138 -0.138
## 89 compreh ~~ block 2.551 1.725 1.725 0.110 0.110
## 90 compreh ~~ matrixreason 2.342 -0.632 -0.632 -0.095 -0.095
## 74 simil ~~ digsym 2.021 -2.575 -2.575 -0.086 -0.086
## 43 workingmemory =~ piccomp 1.899 -0.104 -0.262 -0.113 -0.113
## 49 perceptorg =~ simil 1.675 0.227 0.345 0.108 0.108
## 92 compreh ~~ symbolsearch 1.646 0.764 0.764 0.085 0.085
## 111 piccomp ~~ block 1.591 -1.084 -1.084 -0.117 -0.117
## 85 compreh ~~ arith 1.350 -0.514 -0.514 -0.107 -0.107
## 32 verbalcomp =~ digspan 1.224 0.058 0.365 0.092 0.092
## 79 inform ~~ lnseq 0.998 -0.815 -0.815 -0.061 -0.061
## 69 simil ~~ digspan 0.996 0.540 0.540 0.064 0.064
## 53 perceptorg =~ digspan 0.942 -0.710 -1.077 -0.273 -0.273
## 77 inform ~~ arith 0.890 0.480 0.480 0.089 0.089
## 116 block ~~ digsym 0.805 3.770 3.770 0.063 0.063
## 120 digsym ~~ symbolsearch 0.724 1.948 1.948 0.057 0.057
## 100 digspan ~~ lnseq 0.703 -0.688 -0.688 -0.053 -0.053
## 83 inform ~~ digsym 0.667 1.935 1.935 0.050 0.050
## 36 verbalcomp =~ matrixreason 0.543 0.025 0.159 0.062 0.062
## 61 vocab ~~ piccomp 0.529 0.414 0.414 0.069 0.069
## 105 digspan ~~ symbolsearch 0.481 -0.475 -0.475 -0.049 -0.049
## 52 perceptorg =~ arith 0.478 -0.694 -1.052 -0.352 -0.352
## 98 arith ~~ digsym 0.474 -1.135 -1.135 -0.062 -0.062
## 94 arith ~~ lnseq 0.430 -0.496 -0.496 -0.078 -0.078
## 31 verbalcomp =~ arith 0.237 -0.029 -0.182 -0.061 -0.061
## 103 digspan ~~ matrixreason 0.226 0.221 0.221 0.030 0.030
## 42 workingmemory =~ compreh 0.190 -0.041 -0.103 -0.029 -0.029
## 75 simil ~~ symbolsearch 0.188 -0.227 -0.227 -0.029 -0.029
## 63 vocab ~~ matrixreason 0.143 -0.253 -0.253 -0.033 -0.033
## 109 lnseq ~~ digsym 0.128 -0.951 -0.951 -0.021 -0.021
## 38 verbalcomp =~ symbolsearch 0.077 0.015 0.094 0.023 0.023
## 118 matrixreason ~~ digsym 0.060 -0.380 -0.380 -0.015 -0.015
## 41 workingmemory =~ inform 0.037 0.021 0.053 0.012 0.012
## 119 matrixreason ~~ symbolsearch 0.031 -0.085 -0.085 -0.013 -0.013
## 108 lnseq ~~ matrixreason 0.017 0.069 0.069 0.008 0.008
## 110 lnseq ~~ symbolsearch 0.009 0.072 0.072 0.006 0.006
## 65 vocab ~~ symbolsearch 0.005 -0.068 -0.068 -0.007 -0.007
## 84 inform ~~ symbolsearch 0.004 -0.045 -0.045 -0.004 -0.004
## 82 inform ~~ matrixreason 0.004 0.029 0.029 0.004 0.004
#Update the three-factor model
# Add the correlated residual simil ~~ inform suggested by the largest
# modification index above ("~~" is lavaan syntax for a covariance)
wais.model2 <- 'verbalcomp =~ vocab + simil + inform + compreh
workingmemory =~ arith + digspan + lnseq
perceptorg =~ piccomp + block + matrixreason + digsym + symbolsearch
simil ~~ inform'
#Analyze the three-factor model where data is IQdata
wais.fit2 <- cfa(model=wais.model2, data=IQdata)
#Summarize the three-factor model
summary(wais.fit2, standardized=TRUE, fit.measures=TRUE, rsquare=TRUE)
## lavaan 0.6-5 ended normally after 114 iterations
##
## Estimator ML
## Optimization method NLMINB
## Number of free parameters 28
##
## Number of observations 300
##
## Model Test User Model:
##
## Test statistic 212.813
## Degrees of freedom 50
## P-value (Chi-square) 0.000
##
## Model Test Baseline Model:
##
## Test statistic 1042.916
## Degrees of freedom 66
## P-value 0.000
##
## User Model versus Baseline Model:
##
## Comparative Fit Index (CFI) 0.833
## Tucker-Lewis Index (TLI) 0.780
##
## Loglikelihood and Information Criteria:
##
## Loglikelihood user model (H0) -9929.572
## Loglikelihood unrestricted model (H1) -9823.166
##
## Akaike (AIC) 19915.144
## Bayesian (BIC) 20018.850
## Sample-size adjusted Bayesian (BIC) 19930.051
##
## Root Mean Square Error of Approximation:
##
## RMSEA 0.104
## 90 Percent confidence interval - lower 0.090
## 90 Percent confidence interval - upper 0.119
## P-value RMSEA <= 0.05 0.000
##
## Standardized Root Mean Square Residual:
##
## SRMR 0.071
##
## Parameter Estimates:
##
## Information Expected
## Information saturated (h1) model Structured
## Standard errors Standard
##
## Latent Variables:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## verbalcomp =~
## vocab 1.000 5.888 0.824
## simil 0.361 0.035 10.184 0.000 2.125 0.664
## inform 0.525 0.048 10.857 0.000 3.090 0.706
## compreh 0.334 0.036 9.349 0.000 1.965 0.547
## workingmemory =~
## arith 1.000 2.565 0.857
## digspan 0.857 0.149 5.768 0.000 2.199 0.558
## lnseq 0.193 0.104 1.850 0.064 0.495 0.123
## perceptorg =~
## piccomp 1.000 1.515 0.650
## block 3.737 0.390 9.581 0.000 5.662 0.734
## matrixreason 0.843 0.118 7.176 0.000 1.278 0.499
## digsym 1.615 0.508 3.181 0.001 2.446 0.208
## symbolsearch 1.875 0.203 9.218 0.000 2.841 0.688
##
## Covariances:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## .simil ~~
## .inform -3.738 0.606 -6.169 0.000 -3.738 -0.503
## verbalcomp ~~
## workingmemory 6.278 1.181 5.315 0.000 0.416 0.416
## perceptorg 5.654 0.859 6.583 0.000 0.634 0.634
## workingmemory ~~
## perceptorg 2.237 0.363 6.172 0.000 0.576 0.576
##
## Variances:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## .vocab 16.365 2.375 6.892 0.000 16.365 0.321
## .simil 5.734 0.610 9.399 0.000 5.734 0.560
## .inform 9.635 1.095 8.801 0.000 9.635 0.502
## .compreh 9.026 0.791 11.413 0.000 9.026 0.700
## .arith 2.380 1.037 2.294 0.022 2.380 0.266
## .digspan 10.715 1.154 9.282 0.000 10.715 0.689
## .lnseq 15.830 1.298 12.193 0.000 15.830 0.985
## .piccomp 3.143 0.316 9.937 0.000 3.143 0.578
## .block 27.457 3.220 8.527 0.000 27.457 0.461
## .matrixreason 4.921 0.439 11.216 0.000 4.921 0.751
## .digsym 132.218 10.920 12.108 0.000 132.218 0.957
## .symbolsearch 8.996 0.958 9.393 0.000 8.996 0.527
## verbalcomp 34.667 4.408 7.865 0.000 1.000 1.000
## workingmemory 6.579 1.239 5.309 0.000 1.000 1.000
## perceptorg 2.296 0.407 5.643 0.000 1.000 1.000
##
## R-Square:
## Estimate
## vocab 0.679
## simil 0.440
## inform 0.498
## compreh 0.300
## arith 0.734
## digspan 0.311
## lnseq 0.015
## piccomp 0.422
## block 0.539
## matrixreason 0.249
## digsym 0.043
## symbolsearch 0.473
#Compare the models
# Chi-square difference test between the nested fits (wais.fit2 adds one
# parameter, the simil ~~ inform residual covariance); the highly
# significant result below favors the less restricted wais.fit2
anova(wais.fit, wais.fit2)
## Chi-Squared Difference Test
##
## Df AIC BIC Chisq Chisq diff Df diff Pr(>Chisq)
## wais.fit2 50 19915 20019 212.81
## wais.fit 51 19953 20053 252.81 39.996 1 2.545e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#View the fit indices for the original model
# AIC and ECVI are comparative indices: lower is better
fitmeasures(wais.fit, c("aic", "ecvi"))
## aic ecvi
## 19953.141 1.023
#View the fit indices for the updated model
# Both indices are lower here, again favoring wais.fit2
fitmeasures(wais.fit2, c("aic", "ecvi"))
## aic ecvi
## 19915.144 0.896
#Update the three-factor model to a hierarchical model
# Add a second-order "general" factor (g) measured by the three first-order
# factors -- the latent variables themselves serve as its indicators
wais.model3 <- 'verbalcomp =~ vocab + simil + inform + compreh
workingmemory =~ arith + digspan + lnseq
perceptorg =~ piccomp + block + matrixreason + digsym + symbolsearch
simil ~~ inform
general =~ verbalcomp + workingmemory + perceptorg'
#Analyze the hierarchical model where data is IQdata
wais.fit3 <- cfa(model = wais.model3, data = IQdata)
#Examine the fit indices for the old model
fitmeasures(wais.fit2, c("rmsea", "srmr"))
## rmsea srmr
## 0.104 0.071
#Examine the fit indices for the new model
# NOTE(review): the fit is identical to wais.fit2 -- with only three
# first-order factors the second-order level appears to add no testable
# restrictions; confirm against SEM identification references
fitmeasures(wais.fit3, c("rmsea", "srmr"))
## rmsea srmr
## 0.104 0.071
#Update the default picture
semPlot::semPaths(object = wais.fit3, layout = "tree", rotation = 1, whatLabels = "std",
edge.label.cex = 1, what = "std", edge.color = "navy"
)
Chapter 1 - Explore Data
Import data:
Know data:
Count data (the course video for this section is broken, but it still provides some useful code snippets):
Example code includes:
# Read in "bakeoff.csv" as bakeoff
# read_csv() infers column types, including the two date columns below
bakeoff <- readr::read_csv("./RInputFiles/bakeoff.csv")
## Parsed with column specification:
## cols(
## series = col_double(),
## episode = col_double(),
## baker = col_character(),
## signature = col_character(),
## technical = col_double(),
## showstopper = col_character(),
## result = col_character(),
## uk_airdate = col_date(format = ""),
## us_season = col_double(),
## us_airdate = col_date(format = "")
## )
# Print bakeoff
# Tibble printing shows only the first 10 rows and the columns that fit
bakeoff
## # A tibble: 549 x 10
## series episode baker signature technical showstopper result uk_airdate
## <dbl> <dbl> <chr> <chr> <dbl> <chr> <chr> <date>
## 1 1 1 Anne~ "Light J~ 2 "Chocolate~ IN 2010-08-17
## 2 1 1 David "Chocola~ 3 "Black For~ IN 2010-08-17
## 3 1 1 Edd "Caramel~ 1 <NA> IN 2010-08-17
## 4 1 1 Jasm~ "Fresh M~ NA <NA> IN 2010-08-17
## 5 1 1 Jona~ "Carrot ~ 9 "Three-lay~ IN 2010-08-17
## 6 1 1 Loui~ "Carrot ~ NA "Never Fai~ IN 2010-08-17
## 7 1 1 Mira~ "Triple ~ 8 "Three Tie~ IN 2010-08-17
## 8 1 1 Ruth "Lemon D~ NA "Classic C~ IN 2010-08-17
## 9 1 1 Lea "Cranber~ 10 "Chocolate~ OUT 2010-08-17
## 10 1 1 Mark "Sticky ~ NA "Heart-sha~ OUT 2010-08-17
## # ... with 539 more rows, and 2 more variables: us_season <dbl>,
## # us_airdate <date>
# Data set above is already OK - UNKNOWN are NA in CSV
# Filter rows where showstopper is UNKNOWN
# Zero matching rows below confirms no literal "UNKNOWN" strings survived import
bakeoff %>%
filter(showstopper == "UNKNOWN")
## # A tibble: 0 x 10
## # ... with 10 variables: series <dbl>, episode <dbl>, baker <chr>,
## # signature <chr>, technical <dbl>, showstopper <chr>, result <chr>,
## # uk_airdate <date>, us_season <dbl>, us_airdate <date>
# Edit to add list of missing values
# The na argument lists every string read_csv() should convert to NA
bakeoff <- read_csv("./RInputFiles/bakeoff.csv", na = c("", "NA", "UNKNOWN"))
## Parsed with column specification:
## cols(
## series = col_double(),
## episode = col_double(),
## baker = col_character(),
## signature = col_character(),
## technical = col_double(),
## showstopper = col_character(),
## result = col_character(),
## uk_airdate = col_date(format = ""),
## us_season = col_double(),
## us_airdate = col_date(format = "")
## )
# Filter rows where showstopper is NA
# Use is.na() -- comparing with == NA would yield NA, never TRUE
bakeoff %>%
filter(is.na(showstopper))
## # A tibble: 21 x 10
## series episode baker signature technical showstopper result uk_airdate
## <dbl> <dbl> <chr> <chr> <dbl> <chr> <chr> <date>
## 1 1 1 Edd "Caramel~ 1 <NA> IN 2010-08-17
## 2 1 1 Jasm~ "Fresh M~ NA <NA> IN 2010-08-17
## 3 1 6 Mira~ "Lemon C~ NA <NA> RUNNE~ 2010-09-21
## 4 2 1 Ian "Apple a~ 10 <NA> IN 2011-08-16
## 5 2 1 Jason "Lemon M~ 6 <NA> IN 2011-08-16
## 6 2 1 Urva~ "Cherry ~ 7 <NA> IN 2011-08-16
## 7 2 1 Yasm~ "Cardamo~ 5 <NA> IN 2011-08-16
## 8 2 1 Holly "Cherry ~ 1 <NA> SB 2011-08-16
## 9 2 2 Ben "Chorizo~ 1 <NA> IN 2011-08-23
## 10 2 2 Ian "Stilton~ 2 <NA> IN 2011-08-23
## # ... with 11 more rows, and 2 more variables: us_season <dbl>,
## # us_airdate <date>
# Edit to filter, group by, and skim
# skim() honors dplyr groups, so the summary tables below report each
# variable separately per us_season
bakeoff %>%
filter(!is.na(us_season)) %>%
group_by(us_season) %>%
skimr::skim()
| Name | Piped data |
| Number of rows | 302 |
| Number of columns | 10 |
| _______________________ | |
| Column type frequency: | |
| character | 4 |
| Date | 2 |
| numeric | 3 |
| ________________________ | |
| Group variables | us_season |
Variable type: character
| skim_variable | us_season | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|---|
| baker | 1 | 0 | 1.00 | 3 | 9 | 0 | 13 | 0 |
| baker | 2 | 0 | 1.00 | 4 | 7 | 0 | 12 | 0 |
| baker | 3 | 0 | 1.00 | 3 | 6 | 0 | 12 | 0 |
| baker | 4 | 0 | 1.00 | 3 | 9 | 0 | 12 | 0 |
| signature | 1 | 0 | 1.00 | 10 | 125 | 0 | 78 | 0 |
| signature | 2 | 1 | 0.99 | 15 | 107 | 0 | 73 | 0 |
| signature | 3 | 0 | 1.00 | 12 | 64 | 0 | 74 | 0 |
| signature | 4 | 0 | 1.00 | 12 | 93 | 0 | 75 | 0 |
| showstopper | 1 | 0 | 1.00 | 5 | 126 | 0 | 78 | 0 |
| showstopper | 2 | 1 | 0.99 | 8 | 82 | 0 | 73 | 0 |
| showstopper | 3 | 0 | 1.00 | 10 | 70 | 0 | 73 | 0 |
| showstopper | 4 | 0 | 1.00 | 5 | 86 | 0 | 74 | 0 |
| result | 1 | 0 | 1.00 | 2 | 9 | 0 | 5 | 0 |
| result | 2 | 0 | 1.00 | 2 | 9 | 0 | 6 | 0 |
| result | 3 | 0 | 1.00 | 2 | 9 | 0 | 5 | 0 |
| result | 4 | 0 | 1.00 | 2 | 9 | 0 | 5 | 0 |
Variable type: Date
| skim_variable | us_season | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|---|
| uk_airdate | 1 | 0 | 1 | 2013-08-20 | 2013-10-22 | 2013-09-10 | 10 |
| uk_airdate | 2 | 0 | 1 | 2014-08-06 | 2014-10-08 | 2014-08-27 | 10 |
| uk_airdate | 3 | 0 | 1 | 2015-08-05 | 2015-10-07 | 2015-08-26 | 10 |
| uk_airdate | 4 | 0 | 1 | 2016-08-24 | 2016-10-26 | 2016-09-14 | 10 |
| us_airdate | 1 | 0 | 1 | 2014-12-28 | 2015-03-01 | 2015-01-18 | 10 |
| us_airdate | 2 | 0 | 1 | 2015-09-06 | 2015-11-08 | 2015-09-27 | 10 |
| us_airdate | 3 | 0 | 1 | 2016-07-01 | 2016-08-12 | 2016-07-15 | 7 |
| us_airdate | 4 | 0 | 1 | 2017-06-16 | 2017-08-04 | 2017-06-30 | 8 |
Variable type: numeric
| skim_variable | us_season | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|---|
| series | 1 | 0 | 1.00 | 4.00 | 0.00 | 4 | 4.00 | 4.0 | 4 | 4 | ▁▁▇▁▁ |
| series | 2 | 0 | 1.00 | 5.00 | 0.00 | 5 | 5.00 | 5.0 | 5 | 5 | ▁▁▇▁▁ |
| series | 3 | 0 | 1.00 | 6.00 | 0.00 | 6 | 6.00 | 6.0 | 6 | 6 | ▁▁▇▁▁ |
| series | 4 | 0 | 1.00 | 7.00 | 0.00 | 7 | 7.00 | 7.0 | 7 | 7 | ▁▁▇▁▁ |
| episode | 1 | 0 | 1.00 | 4.31 | 2.66 | 1 | 2.00 | 4.0 | 6 | 10 | ▇▆▅▃▂ |
| episode | 2 | 0 | 1.00 | 4.38 | 2.68 | 1 | 2.00 | 4.0 | 6 | 10 | ▇▆▅▃▂ |
| episode | 3 | 0 | 1.00 | 4.40 | 2.67 | 1 | 2.00 | 4.0 | 6 | 10 | ▇▆▅▃▂ |
| episode | 4 | 0 | 1.00 | 4.40 | 2.67 | 1 | 2.00 | 4.0 | 6 | 10 | ▇▆▅▃▂ |
| technical | 1 | 0 | 1.00 | 5.08 | 3.19 | 1 | 2.25 | 4.5 | 7 | 13 | ▇▅▅▂▂ |
| technical | 2 | 1 | 0.99 | 4.73 | 2.93 | 1 | 2.00 | 4.0 | 7 | 12 | ▇▅▃▂▂ |
| technical | 3 | 0 | 1.00 | 4.80 | 2.92 | 1 | 2.00 | 4.0 | 7 | 12 | ▇▅▃▂▂ |
| technical | 4 | 0 | 1.00 | 4.80 | 2.92 | 1 | 2.00 | 4.0 | 7 | 12 | ▇▅▃▂▂ |
# List each unique value of the result column (one row per distinct value)
distinct(bakeoff, result)
## # A tibble: 6 x 1
## result
## <chr>
## 1 IN
## 2 OUT
## 3 RUNNER UP
## 4 WINNER
## 5 SB
## 6 LEFT
# Tally the number of rows in each result category
# (count() is dplyr shorthand for group_by() + tally())
count(bakeoff, result)
## # A tibble: 6 x 2
## result n
## <chr> <int>
## 1 IN 393
## 2 LEFT 1
## 3 OUT 70
## 4 RUNNER UP 16
## 5 SB 61
## 6 WINNER 8
# Count whether or not star baker
# Counting a logical expression splits the rows into TRUE/FALSE groups;
# the expression itself becomes the column name in the output
bakeoff %>%
count(result=="SB")
## # A tibble: 2 x 2
## `result == "SB"` n
## <lgl> <int>
## 1 FALSE 488
## 2 TRUE 61
# Count the number of rows by series and episode
# n here is the number of bakers appearing in each episode
bakeoff %>%
count(series, episode)
## # A tibble: 74 x 3
## series episode n
## <dbl> <dbl> <int>
## 1 1 1 10
## 2 1 2 8
## 3 1 3 6
## 4 1 4 5
## 5 1 5 4
## 6 1 6 3
## 7 2 1 12
## 8 2 2 11
## 9 2 3 10
## 10 2 4 8
## # ... with 64 more rows
# Add second count by series
# Counting the counts: the inner count() yields one row per episode, so the
# outer count() gives the number of episodes in each series
bakeoff %>%
count(series, episode) %>%
count(series)
## # A tibble: 8 x 2
## series n
## <dbl> <int>
## 1 1 6
## 2 2 8
## 3 3 10
## 4 4 10
## 5 5 10
## 6 6 10
## 7 7 10
## 8 8 10
# Count the number of rows by series and baker
# One row per (series, baker) pair; n is the number of episodes that baker
# appears in within the series
bakers_by_series <-
bakeoff %>%
count(series, baker)
# Print to view
bakers_by_series
## # A tibble: 95 x 3
## series baker n
## <dbl> <chr> <int>
## 1 1 Annetha 2
## 2 1 David 4
## 3 1 Edd 6
## 4 1 Jasminder 5
## 5 1 Jonathan 3
## 6 1 Lea 1
## 7 1 Louise 2
## 8 1 Mark 1
## 9 1 Miranda 6
## 10 1 Ruth 6
## # ... with 85 more rows
# Count again by series
# Since bakers_by_series has one row per (series, baker), this gives the
# number of bakers per series
bakers_by_series %>%
count(series)
## # A tibble: 8 x 2
## series n
## <dbl> <int>
## 1 1 10
## 2 2 12
## 3 3 12
## 4 4 13
## 5 5 12
## 6 6 12
## 7 7 12
## 8 8 12
# Tally how many series each baker name appears in, largest counts first
count(bakers_by_series, baker, sort=TRUE)
## # A tibble: 86 x 2
## baker n
## <chr> <int>
## 1 Kate 3
## 2 Ian 2
## 3 James 2
## 4 Louise 2
## 5 Mark 2
## 6 Peter 2
## 7 Robert 2
## 8 Tom 2
## 9 Ali 1
## 10 Alvin 1
## # ... with 76 more rows
# Bar chart of the number of bakers per episode, one panel per series;
# geom_bar() counts rows per x value by default
ggplot(bakeoff, aes(x=episode)) +
geom_bar() +
facet_wrap(~series)
Chapter 2 - Tame Data
Cast column types:
Recode values:
Examples: `recode(x, `0` = "other", .default = "student")` maps 0 to "other" and every other value to "student"; `recode(x, `0` = NA_character_, .default = "student")` instead maps a specific value to NA (note the typed `NA_character_`). Select variables:
Tame variable names:
Example code includes:
# NOTE THAT THIS WILL THROW WARNINGS
# Try to cast technical as a number
# The CSV encodes missing technical scores as "N/A", which col_number()
# cannot parse, hence the 7 parsing failures reported below
desserts <- readr::read_csv("./RInputFiles/desserts.csv",
col_types = cols(
technical = col_number())
)
## Warning: 7 parsing failures.
## row col expected actual file
## 4 technical a number N/A './RInputFiles/desserts.csv'
## 6 technical a number N/A './RInputFiles/desserts.csv'
## 8 technical a number N/A './RInputFiles/desserts.csv'
## 10 technical a number N/A './RInputFiles/desserts.csv'
## 34 technical a number N/A './RInputFiles/desserts.csv'
## ... ......... ........ ...... ............................
## See problems(...) for more details.
# View parsing problems
# problems() returns one row per parsing failure: location, expected type,
# and the offending value ("N/A" in every case here)
readr::problems(desserts)
## # A tibble: 7 x 5
## row col expected actual file
## <int> <chr> <chr> <chr> <chr>
## 1 4 technical a number N/A './RInputFiles/desserts.csv'
## 2 6 technical a number N/A './RInputFiles/desserts.csv'
## 3 8 technical a number N/A './RInputFiles/desserts.csv'
## 4 10 technical a number N/A './RInputFiles/desserts.csv'
## 5 34 technical a number N/A './RInputFiles/desserts.csv'
## 6 35 technical a number N/A './RInputFiles/desserts.csv'
## 7 36 technical a number N/A './RInputFiles/desserts.csv'
# NOTE THAT THIS WILL FIX THE ERRORS
# Edit code to fix the parsing error
# Adding "N/A" to the na strings lets col_number() succeed on every row
desserts <- readr::read_csv("./RInputFiles/desserts.csv",
col_types = cols(
technical = col_number()),
na = c("", "NA", "N/A")
)
# View parsing problems
# Empty result confirms the parse is now clean
readr::problems(desserts)
## [1] row col expected actual
## <0 rows> (or 0-length row.names)
# Find format to parse uk_airdate
# Trial parse of one sample value: %d = day, %B = full month name, %Y = 4-digit year
readr::parse_date("17 August 2010", format = "%d %B %Y")
## [1] "2010-08-17"
# Edit to cast uk_airdate
# Apply the format found above so uk_airdate imports as a Date column
desserts <- readr::read_csv("./RInputFiles/desserts.csv",
na = c("", "NA", "N/A"),
col_types = cols(
technical = col_number(),
uk_airdate = col_date("%d %B %Y")
))
# Print by descending uk_airdate
# Most recent episodes (series 8, 2017) appear first
desserts %>%
arrange(desc(uk_airdate))
## # A tibble: 549 x 16
## series episode baker technical result uk_airdate us_season us_airdate
## <dbl> <dbl> <chr> <dbl> <chr> <date> <dbl> <date>
## 1 8 10 Kate 3 RUNNE~ 2017-10-31 NA NA
## 2 8 10 Stev~ 1 RUNNE~ 2017-10-31 NA NA
## 3 8 10 Soph~ 2 WINNER 2017-10-31 NA NA
## 4 8 9 Kate 4 IN 2017-10-24 NA NA
## 5 8 9 Stev~ 3 IN 2017-10-24 NA NA
## 6 8 9 Stac~ 2 OUT 2017-10-24 NA NA
## 7 8 9 Soph~ 1 SB 2017-10-24 NA NA
## 8 8 8 Kate 2 IN 2017-10-17 NA NA
## 9 8 8 Soph~ 4 IN 2017-10-17 NA NA
## 10 8 8 Stev~ 1 IN 2017-10-17 NA NA
## # ... with 539 more rows, and 8 more variables: showstopper_chocolate <chr>,
## # showstopper_dessert <chr>, showstopper_fruit <chr>, showstopper_nut <chr>,
## # signature_chocolate <chr>, signature_dessert <chr>, signature_fruit <chr>,
## # signature_nut <chr>
# Cast result a factor
# levels = NULL lets readr discover the factor levels from the data rather
# than requiring them up front
desserts <- readr::read_csv("./RInputFiles/desserts.csv",
na = c("", "NA", "N/A"),
col_types = cols(
technical = col_number(),
uk_airdate = col_date(format = "%d %B %Y"),
result = col_factor(levels=NULL)
))
# Glimpse to view
# result now shows as <fct> in the output below
glimpse(desserts)
## Observations: 549
## Variables: 16
## $ series <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ episode <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, ...
## $ baker <chr> "Annetha", "David", "Edd", "Jasminder", "Jona...
## $ technical <dbl> 2, 3, 1, NA, 9, NA, 8, NA, 10, NA, 8, 6, 2, 1...
## $ result <fct> IN, IN, IN, IN, IN, IN, IN, IN, OUT, OUT, IN,...
## $ uk_airdate <date> 2010-08-17, 2010-08-17, 2010-08-17, 2010-08-...
## $ us_season <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ us_airdate <date> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ showstopper_chocolate <chr> "chocolate", "chocolate", "no chocolate", "no...
## $ showstopper_dessert <chr> "other", "other", "other", "other", "other", ...
## $ showstopper_fruit <chr> "no fruit", "no fruit", "no fruit", "no fruit...
## $ showstopper_nut <chr> "no nut", "no nut", "no nut", "no nut", "almo...
## $ signature_chocolate <chr> "no chocolate", "chocolate", "no chocolate", ...
## $ signature_dessert <chr> "cake", "cake", "cake", "cake", "cake", "cake...
## $ signature_fruit <chr> "no fruit", "fruit", "fruit", "fruit", "fruit...
## $ signature_nut <chr> "no nut", "no nut", "no nut", "no nut", "no n...
# Keep a copy of the wide version before reshaping
oldDesserts <- desserts
# Reshape: stack the showstopper_* / signature_* columns into key-value
# pairs, split each column name into challenge ("showstopper"/"signature")
# and ingredient, then spread the ingredients back out so there is one
# column per ingredient and one row per baker-episode-challenge.
tempDesserts <- desserts %>%
  gather(
    key = "type_ing", value = "status",
    starts_with("showstopper"), starts_with("signature")
  ) %>%
  separate(type_ing, into = c("challenge", "ingredient"), sep = "_") %>%
  spread(ingredient, status)
glimpse(tempDesserts)
## Observations: 1,098
## Variables: 13
## $ series <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ episode <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3,...
## $ baker <chr> "Annetha", "David", "Edd", "Jasminder", "Jonathan", "Lou...
## $ technical <dbl> 2, 3, 1, NA, 9, NA, 8, NA, 10, NA, 8, 6, 2, 1, 3, 5, 7, ...
## $ result <fct> IN, IN, IN, IN, IN, IN, IN, IN, OUT, OUT, IN, IN, IN, IN...
## $ uk_airdate <date> 2010-08-17, 2010-08-17, 2010-08-17, 2010-08-17, 2010-08...
## $ us_season <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ us_airdate <date> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ challenge <chr> "showstopper", "showstopper", "showstopper", "showstoppe...
## $ chocolate <chr> "chocolate", "chocolate", "no chocolate", "no chocolate"...
## $ dessert <chr> "other", "other", "other", "other", "other", "cake", "ca...
## $ fruit <chr> "no fruit", "no fruit", "no fruit", "no fruit", "fruit",...
## $ nut <chr> "no nut", "no nut", "no nut", "no nut", "almond", "no nu...
desserts <- tempDesserts
# Count rows grouping by nut variable
desserts %>%
count(nut, sort=TRUE)
## # A tibble: 8 x 2
## nut n
## <chr> <int>
## 1 no nut 944
## 2 almond 35
## 3 walnut 35
## 4 pistachio 29
## 5 filbert 23
## 6 pecan 14
## 7 multiple 9
## 8 peanut 9
# Recode filberts as hazelnuts
desserts <- desserts %>%
mutate(nut = recode(nut, "filbert" = "hazelnut"))
# Count rows again
desserts %>%
count(nut, sort = TRUE)
## # A tibble: 8 x 2
## nut n
## <chr> <int>
## 1 no nut 944
## 2 almond 35
## 3 walnut 35
## 4 pistachio 29
## 5 hazelnut 23
## 6 pecan 14
## 7 multiple 9
## 8 peanut 9
# Recode the nut column in one pass: rename filberts to hazelnuts and
# treat the "no nut" placeholder as a true missing value.
desserts <- desserts %>%
  mutate(
    nut = recode(nut,
      "filbert" = "hazelnut",
      "no nut"  = NA_character_
    )
  )
# Re-count to confirm "no nut" now shows up as <NA>
desserts %>%
  count(nut, sort = TRUE)
## # A tibble: 8 x 2
## nut n
## <chr> <int>
## 1 <NA> 944
## 2 almond 35
## 3 walnut 35
## 4 pistachio 29
## 5 hazelnut 23
## 6 pecan 14
## 7 multiple 9
## 8 peanut 9
# Edit to recode tech_win as factor
# recode_factor() maps technical == 1 to factor level "1" and every other
# non-missing value to "0"; rows where technical is NA stay NA, which is
# why the warning about an implicit NA level appears below.
desserts <- desserts %>%
mutate(tech_win = recode_factor(technical, `1` = 1,
.default = 0))
# Count to compare values
# Cross-tabulating against the raw condition confirms the recode: the
# NA row (n = 16) corresponds to bakers with no technical score.
desserts %>%
count(technical == 1, tech_win)
## Warning: Factor `tech_win` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## # A tibble: 3 x 3
## `technical == 1` tech_win n
## <lgl> <fct> <int>
## 1 FALSE 0 936
## 2 TRUE 1 146
## 3 NA <NA> 16
ratings0 <- readr::read_csv("./RInputFiles/02.03_messy_ratings.csv")
## Parsed with column specification:
## cols(
## .default = col_double(),
## premiere = col_character(),
## finale = col_character(),
## winner = col_character(),
## day_of_week = col_character(),
## timeslot = col_time(format = ""),
## channel = col_character(),
## runner_up_1 = col_character(),
## runner_up_2 = col_character(),
## season_premiere = col_character(),
## season_finale = col_character(),
## e1_uk_airdate = col_character(),
## e2_uk_airdate = col_character(),
## e3_uk_airdate = col_character(),
## e4_uk_airdate = col_character(),
## e5_uk_airdate = col_character(),
## e6_uk_airdate = col_character(),
## e7_uk_airdate = col_character(),
## e8_uk_airdate = col_character(),
## e9_uk_airdate = col_character(),
## e10_uk_airdate = col_character()
## )
## See spec(...) for full column specifications.
str(ratings0, give.attr=FALSE)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 8 obs. of 44 variables:
## $ series : num 1 2 3 4 5 6 7 8
## $ episodes : num 6 8 10 10 10 10 10 10
## $ premiere : chr "17-Aug-10" "14-Aug-11" "14-Aug-12" "20-Aug-13" ...
## $ finale : chr "21-Sep-10" "4-Oct-11" "16-Oct-12" "22-Oct-13" ...
## $ winner : chr "Edd Kimber" "Joanne Wheatley" "John Whaite" "Frances Quinn" ...
## $ avg_uk_viewers : num 2.77 4 5 7.35 10.04 ...
## $ day_of_week : chr "Tuesday" "Tuesday" "Tuesday" "Tuesday" ...
## $ timeslot : 'hms' num 20:00:00 20:00:00 20:00:00 20:00:00 ...
## $ channel : chr "BBC Two" "BBC Two" "BBC Two" "BBC Two" ...
## $ runner_up_1 : chr "Miranda Gore Browne" "Holly Bell" "Brendan Lynch" "Kimberley Wilson" ...
## $ runner_up_2 : chr "Ruth Clemens" "Mary-Anne Boermans" "James Morton" "Ruby Tandoh" ...
## $ season : num NA NA NA 1 2 3 4 NA
## $ season_premiere : chr NA NA NA "12/28/14" ...
## $ season_finale : chr NA NA NA "3/1/15" ...
## $ e1_viewers_7day : num 2.24 3.1 3.85 6.6 8.51 ...
## $ e1_viewers_28day : num NA NA NA NA NA ...
## $ e2_viewers_7day : num 3 3.53 4.6 6.65 8.79 ...
## $ e2_viewers_28day : num NA NA NA NA NA ...
## $ e3_viewers_7day : num 3 3.82 4.53 7.17 9.28 ...
## $ e3_viewers_28day : num NA NA NA NA NA ...
## $ e4_viewers_7day : num 2.6 3.6 4.71 6.82 10.25 ...
## $ e4_viewers_28day : num NA NA NA NA NA ...
## $ e5_viewers_7day : num 3.03 3.83 4.61 6.95 9.95 ...
## $ e5_viewers_28day : num NA NA NA NA NA ...
## $ e6_viewers_7day : num 2.75 4.25 4.82 7.32 10.13 ...
## $ e6_viewers_28day : num NA NA NA NA NA ...
## $ e7_viewers_7day : num NA 4.42 5.1 7.76 10.28 ...
## $ e7_viewers_28day : num NA NA NA NA NA ...
## $ e8_viewers_7day : num NA 5.06 5.35 7.41 9.02 ...
## $ e8_viewers_28day : num NA NA NA NA NA ...
## $ e9_viewers_7day : num NA NA 5.7 7.41 10.67 ...
## $ e9_viewers_28day : num NA NA NA NA NA ...
## $ e10_viewers_7day : num NA NA 6.74 9.45 13.51 ...
## $ e10_viewers_28day: num NA NA NA NA NA ...
## $ e1_uk_airdate : chr "8/17/10" "8/16/11" "8/14/12" "8/20/13" ...
## $ e2_uk_airdate : chr "8/24/10" "8/23/11" "8/21/12" "8/27/13" ...
## $ e3_uk_airdate : chr "8/31/10" "8/30/11" "8/28/12" "9/3/13" ...
## $ e4_uk_airdate : chr "9/7/10" "9/6/11" "9/4/12" "9/10/13" ...
## $ e5_uk_airdate : chr "9/14/10" "9/13/11" "9/11/12" "9/17/13" ...
## $ e6_uk_airdate : chr "9/21/10" "9/20/11" "9/18/12" "9/24/13" ...
## $ e7_uk_airdate : chr NA "9/27/11" "9/25/12" "10/1/13" ...
## $ e8_uk_airdate : chr NA "10/4/11" "10/2/12" "10/8/13" ...
## $ e9_uk_airdate : chr NA NA "10/9/12" "10/15/13" ...
## $ e10_uk_airdate : chr NA NA "10/16/12" "10/22/13" ...
# Clean the raw ratings: keep series 3 onward, shorten a column name,
# parse the US season dates (month/day/year strings), compute the change
# in 7-day viewers from episode 1 to episode 10, and drop the per-episode
# UK air-date columns, which are not needed downstream.
ratings <- ratings0 %>%
filter(series >= 3) %>%
rename(day=day_of_week) %>%
mutate(series=factor(series),
season_premiere=lubridate::mdy(season_premiere),
season_finale=lubridate::mdy(season_finale),
viewer_growth = (e10_viewers_7day - e1_viewers_7day)
) %>%
select(-contains("uk_airdate"))
# Recode channel as dummy: bbc (1) or not (0)
# "Channel 4" becomes level "0"; every other channel falls through
# via .default to level "1". The result is a factor, not numeric.
ratings <- ratings %>%
mutate(bbc = recode_factor(channel, "Channel 4"=0, .default=1))
# Look at the variables to plot next
ratings %>% select(series, channel, bbc, viewer_growth)
## # A tibble: 6 x 4
## series channel bbc viewer_growth
## <fct> <chr> <fct> <dbl>
## 1 3 BBC Two 1 2.89
## 2 4 BBC Two 1 2.85
## 3 5 BBC One 1 5
## 4 6 BBC One 1 3.43
## 5 7 BBC One 1 2.32
## 6 8 Channel 4 0 0.580
# Make a filled bar chart
ggplot(ratings, aes(x = series, y = viewer_growth, fill = bbc)) +
geom_col()
# Move channel to first column
ratings %>%
select(channel, everything())
## # A tibble: 6 x 36
## channel series episodes premiere finale winner avg_uk_viewers day timeslot
## <chr> <fct> <dbl> <chr> <chr> <chr> <dbl> <chr> <time>
## 1 BBC Two 3 10 14-Aug-~ 16-Oc~ John ~ 5 Tues~ 20:00
## 2 BBC Two 4 10 20-Aug-~ 22-Oc~ Franc~ 7.35 Tues~ 20:00
## 3 BBC One 5 10 6-Aug-14 8-Oct~ Nancy~ 10.0 Wedn~ 20:00
## 4 BBC One 6 10 5-Aug-15 7-Oct~ Nadiy~ 12.5 Wedn~ 20:00
## 5 BBC One 7 10 24-Aug-~ 26-Oc~ Candi~ 13.8 Wedn~ 20:00
## 6 Channe~ 8 10 29-Aug-~ 31-Oc~ Sophi~ 9.29 Tues~ 20:00
## # ... with 27 more variables: runner_up_1 <chr>, runner_up_2 <chr>,
## # season <dbl>, season_premiere <date>, season_finale <date>,
## # e1_viewers_7day <dbl>, e1_viewers_28day <dbl>, e2_viewers_7day <dbl>,
## # e2_viewers_28day <dbl>, e3_viewers_7day <dbl>, e3_viewers_28day <dbl>,
## # e4_viewers_7day <dbl>, e4_viewers_28day <dbl>, e5_viewers_7day <dbl>,
## # e5_viewers_28day <dbl>, e6_viewers_7day <dbl>, e6_viewers_28day <dbl>,
## # e7_viewers_7day <dbl>, e7_viewers_28day <dbl>, e8_viewers_7day <dbl>,
## # e8_viewers_28day <dbl>, e9_viewers_7day <dbl>, e9_viewers_28day <dbl>,
## # e10_viewers_7day <dbl>, e10_viewers_28day <dbl>, viewer_growth <dbl>,
## # bbc <fct>
# Edit to drop 7- and 28-day episode viewer data
ratings %>%
select(-ends_with("day"))
## # A tibble: 6 x 15
## series episodes premiere finale winner avg_uk_viewers timeslot channel
## <fct> <dbl> <chr> <chr> <chr> <dbl> <time> <chr>
## 1 3 10 14-Aug-~ 16-Oc~ John ~ 5 20:00 BBC Two
## 2 4 10 20-Aug-~ 22-Oc~ Franc~ 7.35 20:00 BBC Two
## 3 5 10 6-Aug-14 8-Oct~ Nancy~ 10.0 20:00 BBC One
## 4 6 10 5-Aug-15 7-Oct~ Nadiy~ 12.5 20:00 BBC One
## 5 7 10 24-Aug-~ 26-Oc~ Candi~ 13.8 20:00 BBC One
## 6 8 10 29-Aug-~ 31-Oc~ Sophi~ 9.29 20:00 Channe~
## # ... with 7 more variables: runner_up_1 <chr>, runner_up_2 <chr>,
## # season <dbl>, season_premiere <date>, season_finale <date>,
## # viewer_growth <dbl>, bbc <fct>
# Edit to move channel to first and drop episode viewer data
ratings %>%
select(-ends_with("day")) %>%
select(channel, everything())
## # A tibble: 6 x 15
## channel series episodes premiere finale winner avg_uk_viewers timeslot
## <chr> <fct> <dbl> <chr> <chr> <chr> <dbl> <time>
## 1 BBC Two 3 10 14-Aug-~ 16-Oc~ John ~ 5 20:00
## 2 BBC Two 4 10 20-Aug-~ 22-Oc~ Franc~ 7.35 20:00
## 3 BBC One 5 10 6-Aug-14 8-Oct~ Nancy~ 10.0 20:00
## 4 BBC One 6 10 5-Aug-15 7-Oct~ Nadiy~ 12.5 20:00
## 5 BBC One 7 10 24-Aug-~ 26-Oc~ Candi~ 13.8 20:00
## 6 Channe~ 8 10 29-Aug-~ 31-Oc~ Sophi~ 9.29 20:00
## # ... with 7 more variables: runner_up_1 <chr>, runner_up_2 <chr>,
## # season <dbl>, season_premiere <date>, season_finale <date>,
## # viewer_growth <dbl>, bbc <fct>
# Glimpse messy names
# glimpse(messy_ratings)
# Reformat to lower camelcase
# ratings <- messy_ratings %>%
# clean_names(case="lower_camel")
# Glimpse cleaned names
# glimpse(ratings)
# Reformat to snake case
# ratings <- messy_ratings %>%
# clean_names("snake")
# Glimpse cleaned names
# glimpse(ratings)
# Select 7-day viewer data by series
viewers_7day <- ratings %>%
select(series, contains("7day"))
# Glimpse
glimpse(viewers_7day)
## Observations: 6
## Variables: 11
## $ series <fct> 3, 4, 5, 6, 7, 8
## $ e1_viewers_7day <dbl> 3.85, 6.60, 8.51, 11.62, 13.58, 9.46
## $ e2_viewers_7day <dbl> 4.60, 6.65, 8.79, 11.59, 13.45, 9.23
## $ e3_viewers_7day <dbl> 4.53, 7.17, 9.28, 12.01, 13.01, 8.68
## $ e4_viewers_7day <dbl> 4.71, 6.82, 10.25, 12.36, 13.29, 8.55
## $ e5_viewers_7day <dbl> 4.61, 6.95, 9.95, 12.39, 13.12, 8.61
## $ e6_viewers_7day <dbl> 4.82, 7.32, 10.13, 12.00, 13.13, 8.61
## $ e7_viewers_7day <dbl> 5.10, 7.76, 10.28, 12.35, 13.45, 9.01
## $ e8_viewers_7day <dbl> 5.350, 7.410, 9.023, 11.090, 13.260, 8.950
## $ e9_viewers_7day <dbl> 5.70, 7.41, 10.67, 12.65, 13.44, 9.03
## $ e10_viewers_7day <dbl> 6.74, 9.45, 13.51, 15.05, 15.90, 10.04
# Adapt code to also rename 7-day viewer data
viewers_7day <- ratings %>%
select(series, viewers_7day_ = ends_with("7day"))
# Glimpse
glimpse(viewers_7day)
## Observations: 6
## Variables: 11
## $ series <fct> 3, 4, 5, 6, 7, 8
## $ viewers_7day_1 <dbl> 3.85, 6.60, 8.51, 11.62, 13.58, 9.46
## $ viewers_7day_2 <dbl> 4.60, 6.65, 8.79, 11.59, 13.45, 9.23
## $ viewers_7day_3 <dbl> 4.53, 7.17, 9.28, 12.01, 13.01, 8.68
## $ viewers_7day_4 <dbl> 4.71, 6.82, 10.25, 12.36, 13.29, 8.55
## $ viewers_7day_5 <dbl> 4.61, 6.95, 9.95, 12.39, 13.12, 8.61
## $ viewers_7day_6 <dbl> 4.82, 7.32, 10.13, 12.00, 13.13, 8.61
## $ viewers_7day_7 <dbl> 5.10, 7.76, 10.28, 12.35, 13.45, 9.01
## $ viewers_7day_8 <dbl> 5.350, 7.410, 9.023, 11.090, 13.260, 8.950
## $ viewers_7day_9 <dbl> 5.70, 7.41, 10.67, 12.65, 13.44, 9.03
## $ viewers_7day_10 <dbl> 6.74, 9.45, 13.51, 15.05, 15.90, 10.04
# Adapt code to drop 28-day columns; move 7-day to front
viewers_7day <- ratings %>%
select(viewers_7day_ = ends_with("7day"), everything(), -contains("28day"))
# Glimpse
glimpse(viewers_7day)
## Observations: 6
## Variables: 26
## $ viewers_7day_1 <dbl> 3.85, 6.60, 8.51, 11.62, 13.58, 9.46
## $ viewers_7day_2 <dbl> 4.60, 6.65, 8.79, 11.59, 13.45, 9.23
## $ viewers_7day_3 <dbl> 4.53, 7.17, 9.28, 12.01, 13.01, 8.68
## $ viewers_7day_4 <dbl> 4.71, 6.82, 10.25, 12.36, 13.29, 8.55
## $ viewers_7day_5 <dbl> 4.61, 6.95, 9.95, 12.39, 13.12, 8.61
## $ viewers_7day_6 <dbl> 4.82, 7.32, 10.13, 12.00, 13.13, 8.61
## $ viewers_7day_7 <dbl> 5.10, 7.76, 10.28, 12.35, 13.45, 9.01
## $ viewers_7day_8 <dbl> 5.350, 7.410, 9.023, 11.090, 13.260, 8.950
## $ viewers_7day_9 <dbl> 5.70, 7.41, 10.67, 12.65, 13.44, 9.03
## $ viewers_7day_10 <dbl> 6.74, 9.45, 13.51, 15.05, 15.90, 10.04
## $ series <fct> 3, 4, 5, 6, 7, 8
## $ episodes <dbl> 10, 10, 10, 10, 10, 10
## $ premiere <chr> "14-Aug-12", "20-Aug-13", "6-Aug-14", "5-Aug-15", "...
## $ finale <chr> "16-Oct-12", "22-Oct-13", "8-Oct-14", "7-Oct-15", "...
## $ winner <chr> "John Whaite", "Frances Quinn", "Nancy Birtwhistle"...
## $ avg_uk_viewers <dbl> 5.00, 7.35, 10.04, 12.50, 13.85, 9.29
## $ day <chr> "Tuesday", "Tuesday", "Wednesday", "Wednesday", "We...
## $ timeslot <time> 20:00:00, 20:00:00, 20:00:00, 20:00:00, 20:00:00, ...
## $ channel <chr> "BBC Two", "BBC Two", "BBC One", "BBC One", "BBC On...
## $ runner_up_1 <chr> "Brendan Lynch", "Kimberley Wilson", "Luis Troyano"...
## $ runner_up_2 <chr> "James Morton", "Ruby Tandoh", "Richard Burr", "Tam...
## $ season <dbl> NA, 1, 2, 3, 4, NA
## $ season_premiere <date> NA, 2014-12-28, 2015-09-06, 2016-07-01, 2017-06-16...
## $ season_finale <date> NA, 2015-03-01, 2015-11-08, 2016-08-12, 2017-08-04...
## $ viewer_growth <dbl> 2.89, 2.85, 5.00, 3.43, 2.32, 0.58
## $ bbc <fct> 1, 1, 1, 1, 1, 0
# Adapt code to keep original order
viewers_7day <- ratings %>%
select(everything(), -ends_with("28day"), viewers_7day_ = ends_with("7day"))
# Glimpse
glimpse(viewers_7day)
## Observations: 6
## Variables: 26
## $ series <fct> 3, 4, 5, 6, 7, 8
## $ episodes <dbl> 10, 10, 10, 10, 10, 10
## $ premiere <chr> "14-Aug-12", "20-Aug-13", "6-Aug-14", "5-Aug-15", "...
## $ finale <chr> "16-Oct-12", "22-Oct-13", "8-Oct-14", "7-Oct-15", "...
## $ winner <chr> "John Whaite", "Frances Quinn", "Nancy Birtwhistle"...
## $ avg_uk_viewers <dbl> 5.00, 7.35, 10.04, 12.50, 13.85, 9.29
## $ day <chr> "Tuesday", "Tuesday", "Wednesday", "Wednesday", "We...
## $ timeslot <time> 20:00:00, 20:00:00, 20:00:00, 20:00:00, 20:00:00, ...
## $ channel <chr> "BBC Two", "BBC Two", "BBC One", "BBC One", "BBC On...
## $ runner_up_1 <chr> "Brendan Lynch", "Kimberley Wilson", "Luis Troyano"...
## $ runner_up_2 <chr> "James Morton", "Ruby Tandoh", "Richard Burr", "Tam...
## $ season <dbl> NA, 1, 2, 3, 4, NA
## $ season_premiere <date> NA, 2014-12-28, 2015-09-06, 2016-07-01, 2017-06-16...
## $ season_finale <date> NA, 2015-03-01, 2015-11-08, 2016-08-12, 2017-08-04...
## $ viewers_7day_1 <dbl> 3.85, 6.60, 8.51, 11.62, 13.58, 9.46
## $ viewers_7day_2 <dbl> 4.60, 6.65, 8.79, 11.59, 13.45, 9.23
## $ viewers_7day_3 <dbl> 4.53, 7.17, 9.28, 12.01, 13.01, 8.68
## $ viewers_7day_4 <dbl> 4.71, 6.82, 10.25, 12.36, 13.29, 8.55
## $ viewers_7day_5 <dbl> 4.61, 6.95, 9.95, 12.39, 13.12, 8.61
## $ viewers_7day_6 <dbl> 4.82, 7.32, 10.13, 12.00, 13.13, 8.61
## $ viewers_7day_7 <dbl> 5.10, 7.76, 10.28, 12.35, 13.45, 9.01
## $ viewers_7day_8 <dbl> 5.350, 7.410, 9.023, 11.090, 13.260, 8.950
## $ viewers_7day_9 <dbl> 5.70, 7.41, 10.67, 12.65, 13.44, 9.03
## $ viewers_7day_10 <dbl> 6.74, 9.45, 13.51, 15.05, 15.90, 10.04
## $ viewer_growth <dbl> 2.89, 2.85, 5.00, 3.43, 2.32, 0.58
## $ bbc <fct> 1, 1, 1, 1, 1, 0
Chapter 3 - Tidy Your Data
Introduction to Tidy Data:
Gather:
Separate:
Spread:
Tidy multiple sets of data:
Example code includes:
ratings1 <- readr::read_csv("./RInputFiles/messy_ratings.csv")
## Parsed with column specification:
## cols(
## series = col_double(),
## e1 = col_double(),
## e2 = col_double(),
## e3 = col_double(),
## e4 = col_double(),
## e5 = col_double(),
## e6 = col_double(),
## e7 = col_double(),
## e8 = col_double(),
## e9 = col_double(),
## e10 = col_double()
## )
oldRatings <- ratings
ratings <- ratings1
ratings1
## # A tibble: 8 x 11
## series e1 e2 e3 e4 e5 e6 e7 e8 e9 e10
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 2.24 3 3 2.6 3.03 2.75 NA NA NA NA
## 2 2 3.1 3.53 3.82 3.6 3.83 4.25 4.42 5.06 NA NA
## 3 3 3.85 4.6 4.53 4.71 4.61 4.82 5.1 5.35 5.7 6.74
## 4 4 6.6 6.65 7.17 6.82 6.95 7.32 7.76 7.41 7.41 9.45
## 5 5 8.51 8.79 9.28 10.2 9.95 10.1 10.3 9.02 10.7 13.5
## 6 6 11.6 11.6 12.0 12.4 12.4 12 12.4 11.1 12.6 15.0
## 7 7 13.6 13.4 13.0 13.3 13.1 13.1 13.4 13.3 13.4 15.9
## 8 8 9.46 9.23 8.68 8.55 8.61 8.61 9.01 8.95 9.03 10.0
# Plot of episode 1 viewers by series.
# geom_col() is the idiomatic shorthand for geom_bar(stat = "identity")
# when the bar heights come directly from a column, and it matches the
# style already used for the episode-2 plot below. The rendered chart
# is identical.
ratings %>%
  ggplot(aes(x = series, y = e1)) +
  geom_col()
# Adapt code to plot episode 2 viewers by series
ggplot(ratings, aes(x = series, y = e2)) +
  geom_col()
# Gather and count episodes
# Stack the e1..e10 columns into (episode, viewers_7day) pairs.
# factor_key = TRUE keeps episodes in column order rather than sorting
# the names alphabetically, and na.rm = TRUE drops unaired episodes.
# episode_count is a running index over every aired episode of the show,
# used as a continuous x-axis below.
tidy_ratings <- ratings %>%
gather(key = "episode", value = "viewers_7day", -series,
factor_key = TRUE, na.rm = TRUE) %>%
arrange(series, episode) %>%
mutate(episode_count = row_number())
# Plot viewers by episode and series
tidy_ratings <- tidy_ratings
ggplot(tidy_ratings, aes(x = episode_count, y = viewers_7day, fill = as.factor(series))) +
geom_col()
ratings2 <- readr::read_csv("./RInputFiles/messy_ratings2.csv")
## Parsed with column specification:
## cols(
## .default = col_double()
## )
## See spec(...) for full column specifications.
ratings2$series <- as.factor(ratings2$series)
ratings2
## # A tibble: 8 x 21
## series e1_7day e1_28day e2_7day e2_28day e3_7day e3_28day e4_7day e4_28day
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 2.24 NA 3 NA 3 NA 2.6 NA
## 2 2 3.1 NA 3.53 NA 3.82 NA 3.6 NA
## 3 3 3.85 NA 4.6 NA 4.53 NA 4.71 NA
## 4 4 6.6 NA 6.65 NA 7.17 NA 6.82 NA
## 5 5 8.51 NA 8.79 NA 9.28 NA 10.2 NA
## 6 6 11.6 11.7 11.6 11.8 12.0 NA 12.4 12.7
## 7 7 13.6 13.9 13.4 13.7 13.0 13.4 13.3 13.9
## 8 8 9.46 9.72 9.23 9.53 8.68 9.06 8.55 8.87
## # ... with 12 more variables: e5_7day <dbl>, e5_28day <dbl>, e6_7day <dbl>,
## # e6_28day <dbl>, e7_7day <dbl>, e7_28day <dbl>, e8_7day <dbl>,
## # e8_28day <dbl>, e9_7day <dbl>, e9_28day <dbl>, e10_7day <dbl>,
## # e10_28day <dbl>
# Gather 7-day viewers by episode (ratings2 already loaded)
week_ratings <- ratings2 %>%
select(series, ends_with("7day")) %>%
gather(episode, viewers_7day, ends_with("7day"), na.rm = TRUE, factor_key = TRUE)
# Plot 7-day viewers by episode and series
ggplot(week_ratings, aes(x = episode, y = viewers_7day, group = series)) +
geom_line() +
facet_wrap(~series)
# Edit to parse episode number
# separate() with into = "episode" and extra = "drop" keeps only the
# text before the first separator ("e1" out of "e1_7day"); parse_number()
# then extracts the numeric episode index from it.
week_ratings <- ratings2 %>%
select(series, ends_with("7day")) %>%
gather(episode, viewers_7day, ends_with("7day"), na.rm = TRUE) %>%
separate(episode, into = "episode", extra = "drop") %>%
mutate(episode = parse_number(episode))
# Edit your code to color by series and add a theme
# guides(color = FALSE) suppresses the redundant color legend (the
# facets already identify the series). NOTE(review): ggplot2 >= 3.3.4
# prefers guides(color = "none"); FALSE is fine for the 3.2.1 used here.
ggplot(week_ratings, aes(x = episode, y = viewers_7day,
group = series, color = series)) +
geom_line() +
facet_wrap(~series) +
guides(color = FALSE) +
theme_minimal()
# Demo of separate()/unite(): split the viewer counts at the literal
# decimal point (sep = "\\." escapes the regex metacharacter). Whole
# numbers have no decimal part, so separate() fills those rows with NA
# and emits the (expected) warning shown below; the ifelse() then
# normalizes the decimal part so it always carries a leading ".".
week_ratings_dec <- week_ratings %>%
mutate(viewers_7day=as.character(viewers_7day)) %>%
separate(viewers_7day, into=c("viewers_millions", "viewers_decimal"), sep="\\.") %>%
mutate(viewers_decimal=ifelse(is.na(viewers_decimal), ".", paste0(".", viewers_decimal))) %>%
dplyr::arrange(series, episode)
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 3 rows [9, 17,
## 46].
# Unite the millions and decimal parts back into one column
# (unite()'s default separator "_" produces the ugly "2_.24" below)
ratings3 <- week_ratings_dec %>%
unite("viewers_7day", viewers_millions, viewers_decimal)
# Print to view
ratings3
## # A tibble: 74 x 3
## series episode viewers_7day
## <fct> <dbl> <chr>
## 1 1 1 2_.24
## 2 1 2 3_.
## 3 1 3 3_.
## 4 1 4 2_.6
## 5 1 5 3_.03
## 6 1 6 2_.75
## 7 2 1 3_.1
## 8 2 2 3_.53
## 9 2 3 3_.82
## 10 2 4 3_.6
## # ... with 64 more rows
# Adapt to change the separator
ratings3 <- week_ratings_dec %>%
unite(viewers_7day, viewers_millions, viewers_decimal, sep="")
# Print to view
ratings3
## # A tibble: 74 x 3
## series episode viewers_7day
## <fct> <dbl> <chr>
## 1 1 1 2.24
## 2 1 2 3.
## 3 1 3 3.
## 4 1 4 2.6
## 5 1 5 3.03
## 6 1 6 2.75
## 7 2 1 3.1
## 8 2 2 3.53
## 9 2 3 3.82
## 10 2 4 3.6
## # ... with 64 more rows
# Adapt to cast viewers as a number
ratings3 <- week_ratings_dec %>%
unite(viewers_7day, viewers_millions, viewers_decimal, sep="") %>%
mutate(viewers_7day = parse_number(viewers_7day))
# Print to view
ratings3
## # A tibble: 74 x 3
## series episode viewers_7day
## <fct> <dbl> <dbl>
## 1 1 1 2.24
## 2 1 2 3
## 3 1 3 3
## 4 1 4 2.6
## 5 1 5 3.03
## 6 1 6 2.75
## 7 2 1 3.1
## 8 2 2 3.53
## 9 2 3 3.82
## 10 2 4 3.6
## # ... with 64 more rows
# Create tidy data with 7- and 28-day viewers
# Gather every *_7day / *_28day column, then split names like "e1_7day"
# into the episode part and the days part, and parse both to numbers.
tidy_ratings_all <- ratings2 %>%
gather(episode, viewers, ends_with("day"), na.rm = TRUE) %>%
separate(episode, into = c("episode", "days")) %>%
mutate(episode = parse_number(episode),
days = parse_number(days))
# Adapt to spread counted values
# count(..., wt = viewers) sums viewers per series/days instead of
# counting rows; spread(sep = "_") prefixes the new columns as days_7 /
# days_28 rather than bare "7" / "28".
tidy_ratings_all %>%
count(series, days, wt = viewers) %>%
spread(key=days, value=n, sep="_")
## # A tibble: 8 x 3
## series days_7 days_28
## <fct> <dbl> <dbl>
## 1 1 16.6 NA
## 2 2 31.6 NA
## 3 3 50.0 NA
## 4 4 73.5 NA
## 5 5 100. NA
## 6 6 123. 113
## 7 7 136. 138.
## 8 8 90.2 92.9
# Fill in blanks to get premiere/finale data
tidy_ratings <- ratings %>%
gather(episode, viewers, -series, na.rm = TRUE) %>%
mutate(episode = parse_number(episode)) %>%
group_by(series) %>%
filter(episode == 1 | episode == max(episode)) %>%
ungroup()
# Recode first/last episodes
first_last <- tidy_ratings %>%
mutate(episode = recode(episode, `1` = "first", .default = "last"))
# Fill in to make slope chart
ggplot(first_last, aes(x = episode, y = viewers, color = as.factor(series))) +
geom_point() +
geom_line(aes(group = series))
# Switch the variables mapping x-axis and color
ggplot(first_last, aes(x = series, y = viewers, color = episode)) +
geom_point() + # keep
geom_line(aes(group = series)) + # keep
coord_flip() # keep
# Calculate relative increase in viewers
bump_by_series <- first_last %>%
spread(episode, viewers) %>%
mutate(bump = (last - first) / first)
# Fill in to make bar chart of bumps by series
ggplot(bump_by_series, aes(x = series, y = bump)) +
geom_col() +
scale_y_continuous(labels = scales::percent) # converts to %
Chapter 4 - Transform Your Data
Complex recoding with case_when:
Factors:
Dates:
Strings:
Final thoughts:
Example code includes:
baker_results <- readr::read_csv("./RInputFiles/baker_results.csv")
## Parsed with column specification:
## cols(
## .default = col_double(),
## baker_full = col_character(),
## baker = col_character(),
## occupation = col_character(),
## hometown = col_character(),
## baker_last = col_character(),
## baker_first = col_character(),
## first_date_appeared = col_date(format = ""),
## last_date_appeared = col_date(format = ""),
## first_date_us = col_date(format = ""),
## last_date_us = col_date(format = "")
## )
## See spec(...) for full column specifications.
messy_baker_results <- readr::read_csv("./RInputFiles/messy_baker_results.csv")
## Parsed with column specification:
## cols(
## .default = col_character(),
## series = col_double(),
## star_baker = col_double(),
## technical_winner = col_double(),
## technical_top3 = col_double(),
## technical_bottom = col_double(),
## technical_highest = col_double(),
## technical_lowest = col_double(),
## technical_median = col_double(),
## series_winner = col_double(),
## series_runner_up = col_double(),
## total_episodes_appeared = col_double(),
## percent_episodes_appeared = col_double(),
## percent_technical_top3 = col_double(),
## first_date_appeared_uk = col_date(format = ""),
## last_date_appeared_uk = col_date(format = ""),
## first_date_us = col_date(format = ""),
## last_date_us = col_date(format = ""),
## e_1_technical = col_double(),
## e_10_technical = col_double(),
## e_2_technical = col_double()
## # ... with 7 more columns
## )
## See spec(...) for full column specifications.
bakers <- baker_results
glimpse(bakers)
## Observations: 95
## Variables: 24
## $ series <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2,...
## $ baker_full <chr> "Annetha Mills", "David Chambers", "Edwar...
## $ baker <chr> "Annetha", "David", "Edd", "Jasminder", "...
## $ age <dbl> 30, 31, 24, 45, 25, 51, 44, 48, 37, 31, 3...
## $ occupation <chr> "Single mother", "Entrepreneur", "Debt co...
## $ hometown <chr> "Essex", "Milton Keynes", "Bradford", "Bi...
## $ baker_last <chr> "Mills", "Chambers", "Kimber", "Randhawa"...
## $ baker_first <chr> "Annetha", "David", "Edward", "Jasminder"...
## $ star_baker <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 1,...
## $ technical_winner <dbl> 0, 0, 2, 0, 1, 0, 0, 0, 2, 0, 1, 2, 0, 1,...
## $ technical_top3 <dbl> 1, 1, 4, 2, 1, 0, 0, 0, 4, 2, 3, 5, 1, 1,...
## $ technical_bottom <dbl> 1, 3, 1, 2, 2, 1, 1, 0, 1, 2, 1, 3, 2, 6,...
## $ technical_highest <dbl> 2, 3, 1, 2, 1, 10, 4, NA, 1, 2, 1, 1, 2, ...
## $ technical_lowest <dbl> 7, 8, 6, 5, 9, 10, 4, NA, 8, 5, 5, 6, 10,...
## $ technical_median <dbl> 4.5, 4.5, 2.0, 3.0, 6.0, 10.0, 4.0, NA, 3...
## $ series_winner <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ series_runner_up <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0,...
## $ total_episodes_appeared <dbl> 2, 4, 6, 5, 3, 1, 2, 1, 6, 6, 4, 8, 3, 7,...
## $ first_date_appeared <date> 2010-08-17, 2010-08-17, 2010-08-17, 2010...
## $ last_date_appeared <date> 2010-08-24, 2010-09-07, 2010-09-21, 2010...
## $ first_date_us <date> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ last_date_us <date> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ percent_episodes_appeared <dbl> 33.33333, 66.66667, 100.00000, 83.33333, ...
## $ percent_technical_top3 <dbl> 50.00000, 25.00000, 66.66667, 40.00000, 3...
# Create skill variable with 3 levels
# case_when() evaluates conditions top-down; the TRUE catch-all labels
# every remaining row (ties, including bakers with 0 star-baker AND 0
# technical wins) as "well_rounded" — the filter below shows 41 such
# 0/0 bakers get swept in, which a later version recodes to NA instead.
bakers <- bakers %>%
mutate(skill = case_when(
star_baker > technical_winner ~ "super_star",
star_baker < technical_winner ~ "high_tech",
TRUE ~ "well_rounded"
))
# Filter zeroes to examine skill variable
bakers %>%
filter(star_baker==0 & technical_winner==0) %>%
count(skill)
## # A tibble: 1 x 2
## skill n
## <chr> <int>
## 1 well_rounded 41
# Add pipe to drop skill = NA
bakers_skill <- bakers %>%
mutate(skill = case_when(
star_baker > technical_winner ~ "super_star",
star_baker < technical_winner ~ "high_tech",
star_baker == 0 & technical_winner == 0 ~ NA_character_,
star_baker == technical_winner ~ "well_rounded"
)) %>%
drop_na(skill)
# Count bakers by skill
bakers_skill %>%
count(skill)
## # A tibble: 3 x 2
## skill n
## <chr> <int>
## 1 high_tech 24
## 2 super_star 15
## 3 well_rounded 15
# Cast skill as a factor
bakers <- bakers %>%
mutate(skill = as.factor(skill))
# Examine levels
bakers %>%
pull(skill) %>%
levels()
## [1] "high_tech" "super_star" "well_rounded"
baker_dates <- bakers %>%
select(series, baker, contains("date")) %>%
mutate(last_date_appeared_us=as.character(last_date_us),
first_date_appeared_us=as.character(first_date_us)
) %>%
rename(first_date_appeared_uk=first_date_appeared, last_date_appeared_uk=last_date_appeared) %>%
select(-last_date_us, -first_date_us)
glimpse(baker_dates)
## Observations: 95
## Variables: 6
## $ series <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2,...
## $ baker <chr> "Annetha", "David", "Edd", "Jasminder", "Jon...
## $ first_date_appeared_uk <date> 2010-08-17, 2010-08-17, 2010-08-17, 2010-08...
## $ last_date_appeared_uk <date> 2010-08-24, 2010-09-07, 2010-09-21, 2010-09...
## $ last_date_appeared_us <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ first_date_appeared_us <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# Add a line to extract labeled month
baker_dates <- baker_dates %>%
mutate(last_date_appeared_us=lubridate::ymd(last_date_appeared_us),
last_month_us=lubridate::month(last_date_appeared_us, label=TRUE)
)
ggplot(baker_dates, aes(x=last_month_us)) + geom_bar()
baker_time <- baker_dates %>%
mutate(first_date_appeared_us=lubridate::ymd(first_date_appeared_us)) %>%
select(-last_month_us)
glimpse(baker_time)
## Observations: 95
## Variables: 6
## $ series <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2,...
## $ baker <chr> "Annetha", "David", "Edd", "Jasminder", "Jon...
## $ first_date_appeared_uk <date> 2010-08-17, 2010-08-17, 2010-08-17, 2010-08...
## $ last_date_appeared_uk <date> 2010-08-24, 2010-09-07, 2010-09-21, 2010-09...
## $ last_date_appeared_us <date> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ first_date_appeared_us <date> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
# Add a line to create whole months on air variable
# time_on_air is a lubridate Interval spanning each baker's first and
# last UK appearance. Dividing by weeks(1) gives a fractional count of
# weeks; integer division (%/%) by months(1) gives whole elapsed months.
baker_time <- baker_time %>%
mutate(time_on_air = lubridate::interval(first_date_appeared_uk, last_date_appeared_uk),
weeks_on_air = time_on_air / lubridate::weeks(1),
months_on_air = time_on_air %/% months(1)
)
# Count rows
messy_baker_results %>%
count(position_reached)
## # A tibble: 8 x 2
## position_reached n
## <chr> <int>
## 1 Runner-Up 1
## 2 Runner up 2
## 3 Runner Up 12
## 4 Third Place 1
## 5 winner 2
## 6 Winner 1
## 7 WINNER 5
## 8 <NA> 71
# Add another mutate to replace "THIRD PLACE" with "RUNNER UP" and count
# Normalize the messy position labels step by step: upper-case everything
# ("winner"/"Winner"/"WINNER" collapse to one value), turn the hyphenated
# "RUNNER-UP" into "RUNNER UP", then fold "THIRD PLACE" into "RUNNER UP".
messy_baker_results <- messy_baker_results %>%
mutate(position_reached = str_to_upper(position_reached),
position_reached = str_replace(position_reached, "-", " "),
position_reached = str_replace(position_reached, "THIRD PLACE", "RUNNER UP"))
# Count rows
messy_baker_results %>%
count(position_reached)
## # A tibble: 3 x 2
## position_reached n
## <chr> <int>
## 1 RUNNER UP 16
## 2 WINNER 8
## 3 <NA> 71
# Add a line to create new variable called student
bakers <- bakers %>%
mutate(occupation = str_to_lower(occupation),
student=str_detect(occupation, "student")
)
# Find all students and examine occupations
bakers %>%
filter(student) %>%
select(baker, occupation, student)
## # A tibble: 8 x 3
## baker occupation student
## <chr> <chr> <lgl>
## 1 Jason civil engineering student TRUE
## 2 James medical student TRUE
## 3 John law student TRUE
## 4 Ruby history of art and philosophy student TRUE
## 5 Martha student TRUE
## 6 Michael student TRUE
## 7 Rav student support TRUE
## 8 Liam student TRUE
Chapter 1 - Introduction to Modeling
Background on modeling for explanation:
Background on modeling for prediction:
Modeling problem for explanation:
Modeling problem for prediction:
Example code includes:
data(evals, package="moderndive")
glimpse(evals)
## Observations: 463
## Variables: 14
## $ ID <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...
## $ prof_ID <int> 1, 1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, ...
## $ score <dbl> 4.7, 4.1, 3.9, 4.8, 4.6, 4.3, 2.8, 4.1, 3.4, 4.5, 3.8,...
## $ age <int> 36, 36, 36, 36, 59, 59, 59, 51, 51, 40, 40, 40, 40, 40...
## $ bty_avg <dbl> 5.000, 5.000, 5.000, 5.000, 3.000, 3.000, 3.000, 3.333...
## $ gender <fct> female, female, female, female, male, male, male, male...
## $ ethnicity <fct> minority, minority, minority, minority, not minority, ...
## $ language <fct> english, english, english, english, english, english, ...
## $ rank <fct> tenure track, tenure track, tenure track, tenure track...
## $ pic_outfit <fct> not formal, not formal, not formal, not formal, not fo...
## $ pic_color <fct> color, color, color, color, color, color, color, color...
## $ cls_did_eval <int> 24, 86, 76, 77, 17, 35, 39, 55, 111, 40, 24, 24, 17, 1...
## $ cls_students <int> 43, 125, 125, 123, 20, 40, 44, 55, 195, 46, 27, 25, 20...
## $ cls_level <fct> upper, upper, upper, upper, upper, upper, upper, upper...
# Plot the histogram
# Distribution of instructor ages in evals, binned in 5-year bins.
ggplot(evals, aes(x = age)) +
geom_histogram(binwidth = 5) +
labs(x = "age", y = "count")
# Compute summary stats
# Mean (48.4) is close to the median (48), so ages are roughly symmetric.
evals %>%
summarize(mean_age = mean(age),
median_age = median(age),
sd_age = sd(age))
## # A tibble: 1 x 3
## mean_age median_age sd_age
## <dbl> <int> <dbl>
## 1 48.4 48 9.80
data(house_prices, package="moderndive")
glimpse(house_prices)
## Observations: 21,613
## Variables: 21
## $ id <chr> "7129300520", "6414100192", "5631500400", "2487200875...
## $ date <date> 2014-10-13, 2014-12-09, 2015-02-25, 2014-12-09, 2015...
## $ price <dbl> 221900, 538000, 180000, 604000, 510000, 1225000, 2575...
## $ bedrooms <int> 3, 3, 2, 4, 3, 4, 3, 3, 3, 3, 3, 2, 3, 3, 5, 4, 3, 4,...
## $ bathrooms <dbl> 1.00, 2.25, 1.00, 3.00, 2.00, 4.50, 2.25, 1.50, 1.00,...
## $ sqft_living <int> 1180, 2570, 770, 1960, 1680, 5420, 1715, 1060, 1780, ...
## $ sqft_lot <int> 5650, 7242, 10000, 5000, 8080, 101930, 6819, 9711, 74...
## $ floors <dbl> 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 2.0, 1.0...
## $ waterfront <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALS...
## $ view <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0,...
## $ condition <fct> 3, 3, 3, 5, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 4,...
## $ grade <fct> 7, 7, 6, 7, 8, 11, 7, 7, 7, 7, 8, 7, 7, 7, 7, 9, 7, 7...
## $ sqft_above <int> 1180, 2170, 770, 1050, 1680, 3890, 1715, 1060, 1050, ...
## $ sqft_basement <int> 0, 400, 0, 910, 0, 1530, 0, 0, 730, 0, 1700, 300, 0, ...
## $ yr_built <int> 1955, 1951, 1933, 1965, 1987, 2001, 1995, 1963, 1960,...
## $ yr_renovated <int> 0, 1991, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ zipcode <fct> 98178, 98125, 98028, 98136, 98074, 98053, 98003, 9819...
## $ lat <dbl> 47.5112, 47.7210, 47.7379, 47.5208, 47.6168, 47.6561,...
## $ long <dbl> -122.257, -122.319, -122.233, -122.393, -122.045, -12...
## $ sqft_living15 <int> 1340, 1690, 2720, 1360, 1800, 4760, 2238, 1650, 1780,...
## $ sqft_lot15 <int> 5650, 7639, 8062, 5000, 7503, 101930, 6819, 9711, 811...
# Plot the histogram
# Raw living-space sizes are strongly right-skewed.
ggplot(house_prices, aes(x = sqft_living)) +
geom_histogram() +
labs(x="Size (sq.feet)", y="count")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Add log10_sqft_living
# A log10 transform tames the skew and makes the distribution more symmetric.
house_prices_2 <- house_prices %>%
mutate(log10_sqft_living = log10(sqft_living))
# Plot the histogram
ggplot(house_prices_2, aes(x = log10_sqft_living)) +
geom_histogram() +
labs(x = "log10 size", y = "count")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Plot the histogram
ggplot(evals, aes(x=bty_avg)) +
geom_histogram(binwidth=0.5) +
labs(x = "Beauty score", y = "count")
# Scatterplot
ggplot(evals, aes(x = bty_avg, y = score)) +
geom_point() +
labs(x = "beauty score", y = "teaching score")
# Jitter plot
# Jittering reveals points hidden by overplotting (scores take discrete values).
ggplot(evals, aes(x = bty_avg, y = score)) +
geom_jitter() +
labs(x = "beauty score", y = "teaching score")
# Compute correlation
# Weak positive linear association between beauty and teaching score.
evals %>%
summarize(correlation = cor(score, bty_avg))
## # A tibble: 1 x 1
## correlation
## <dbl>
## 1 0.187
# Work with price on the log10 scale (prices are right-skewed, like size).
house_prices <- house_prices %>%
mutate(log10_price=log10(price))
# View the structure of log10_price and waterfront
house_prices %>%
select(log10_price, waterfront) %>%
glimpse()
## Observations: 21,613
## Variables: 2
## $ log10_price <dbl> 5.346157, 5.730782, 5.255273, 5.781037, 5.707570, 6.088...
## $ waterfront <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,...
# Plot
ggplot(house_prices, aes(x = waterfront, y = log10_price)) +
geom_boxplot() +
labs(x = "waterfront", y = "log10 price")
# Calculate stats
# Group means of log10 price by waterfront status; n gives group sizes.
house_prices %>%
group_by(waterfront) %>%
summarize(mean_log10_price = mean(log10_price), n = n())
## # A tibble: 2 x 3
## waterfront mean_log10_price n
## <lgl> <dbl> <int>
## 1 FALSE 5.66 21450
## 2 TRUE 6.12 163
# Prediction of price for houses with view
# Back-transform the group mean from log10 dollars to dollars.
10^(6.12)
## [1] 1318257
# Prediction of price for houses without view
10^(5.66)
## [1] 457088.2
Chapter 2 - Modeling with Regression
Explaining teaching score with age:
Predicting teaching score using age:
Explaining teaching score with gender:
Predicting teaching score with gender:
Example code includes:
# Plot
# Scatterplot of teaching score vs beauty score with the fitted OLS line
# (se = FALSE suppresses the confidence band).
ggplot(evals, aes(x = bty_avg, y = score)) +
geom_point() +
labs(x = "beauty score", y = "score") +
geom_smooth(method = "lm", se = FALSE)
# Fit model
# Simple linear regression of teaching score on average beauty score.
model_score_2 <- lm(score ~ bty_avg, data = evals)
# Output content
model_score_2
##
## Call:
## lm(formula = score ~ bty_avg, data = evals)
##
## Coefficients:
## (Intercept) bty_avg
## 3.88034 0.06664
# Output regression table
moderndive::get_regression_table(model_score_2)
## # A tibble: 2 x 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept 3.88 0.076 51.0 0 3.73 4.03
## 2 bty_avg 0.067 0.016 4.09 0 0.035 0.099
# Use fitted intercept and slope to get a prediction
# Predicted score for a beauty score of 5, using rounded coefficients.
y_hat <- 3.88 + 0.067 * 5
y_hat
## [1] 4.215
# Compute residual y - y_hat
# Observed score 4.7 minus the prediction above.
4.7 - y_hat
## [1] 0.485
# Get regression table
# digits = 5 shows the estimates with more precision.
moderndive::get_regression_table(model_score_2, digits = 5)
## # A tibble: 2 x 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept 3.88 0.0761 51.0 0 3.73 4.03
## 2 bty_avg 0.0666 0.0163 4.09 0.00005 0.0346 0.0986
# Get all fitted/predicted values and residuals
moderndive::get_regression_points(model_score_2)
## # A tibble: 463 x 5
## ID score bty_avg score_hat residual
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 4.7 5 4.21 0.486
## 2 2 4.1 5 4.21 -0.114
## 3 3 3.9 5 4.21 -0.314
## 4 4 4.8 5 4.21 0.586
## 5 5 4.6 3 4.08 0.52
## 6 6 4.3 3 4.08 0.22
## 7 7 2.8 3 4.08 -1.28
## 8 8 4.1 3.33 4.10 -0.002
## 9 9 3.4 3.33 4.10 -0.702
## 10 10 4.5 3.17 4.09 0.409
## # ... with 453 more rows
# Get all fitted/predicted values and residuals
moderndive::get_regression_points(model_score_2) %>%
mutate(score_hat_2 = 3.88 + 0.0666 * bty_avg)
## # A tibble: 463 x 6
## ID score bty_avg score_hat residual score_hat_2
## <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 4.7 5 4.21 0.486 4.21
## 2 2 4.1 5 4.21 -0.114 4.21
## 3 3 3.9 5 4.21 -0.314 4.21
## 4 4 4.8 5 4.21 0.586 4.21
## 5 5 4.6 3 4.08 0.52 4.08
## 6 6 4.3 3 4.08 0.22 4.08
## 7 7 2.8 3 4.08 -1.28 4.08
## 8 8 4.1 3.33 4.10 -0.002 4.10
## 9 9 3.4 3.33 4.10 -0.702 4.10
## 10 10 4.5 3.17 4.09 0.409 4.09
## # ... with 453 more rows
# Get all fitted/predicted values and residuals
moderndive::get_regression_points(model_score_2) %>%
mutate(residual_2 = score - score_hat)
## # A tibble: 463 x 6
## ID score bty_avg score_hat residual residual_2
## <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 4.7 5 4.21 0.486 0.486
## 2 2 4.1 5 4.21 -0.114 -0.114
## 3 3 3.9 5 4.21 -0.314 -0.314
## 4 4 4.8 5 4.21 0.586 0.586
## 5 5 4.6 3 4.08 0.52 0.520
## 6 6 4.3 3 4.08 0.22 0.220
## 7 7 2.8 3 4.08 -1.28 -1.28
## 8 8 4.1 3.33 4.10 -0.002 -0.002
## 9 9 3.4 3.33 4.10 -0.702 -0.702
## 10 10 4.5 3.17 4.09 0.409 0.409
## # ... with 453 more rows
# Boxplots of teaching score by professor rank.
ggplot(evals, aes(x=rank, y=score)) +
geom_boxplot() +
labs(x = "rank", y = "score")
# Group sizes, mean and SD of score for each rank.
evals %>%
group_by(rank) %>%
summarize(n = n(), mean_score = mean(score), sd_score = sd(score))
## # A tibble: 3 x 4
## rank n mean_score sd_score
## <fct> <int> <dbl> <dbl>
## 1 teaching 102 4.28 0.498
## 2 tenure track 108 4.15 0.561
## 3 tenured 253 4.14 0.550
# Fit regression model
# Regression on a categorical predictor: "teaching" (first factor level)
# is the baseline; the other coefficients are offsets from it.
model_score_4 <- lm(score ~ rank, data = evals)
# Get regression table
moderndive::get_regression_table(model_score_4, digits = 5)
## # A tibble: 3 x 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept 4.28 0.0536 79.9 0 4.18 4.39
## 2 ranktenure track -0.130 0.0748 -1.73 0.0837 -0.277 0.0173
## 3 ranktenured -0.145 0.0636 -2.28 0.0228 -0.270 -0.0203
# teaching mean
# Baseline group mean = intercept.
teaching_mean <- 4.28
# tenure track mean
# Baseline plus the "ranktenure track" offset.
tenure_track_mean <- 4.28-0.13
# tenure mean
# Baseline plus the "ranktenured" offset.
tenure_mean <- 4.28-0.145
# Calculate predictions and residuals
model_score_4_points <- moderndive::get_regression_points(model_score_4)
model_score_4_points
## # A tibble: 463 x 5
## ID score rank score_hat residual
## <int> <dbl> <fct> <dbl> <dbl>
## 1 1 4.7 tenure track 4.16 0.545
## 2 2 4.1 tenure track 4.16 -0.055
## 3 3 3.9 tenure track 4.16 -0.255
## 4 4 4.8 tenure track 4.16 0.645
## 5 5 4.6 tenured 4.14 0.461
## 6 6 4.3 tenured 4.14 0.161
## 7 7 2.8 tenured 4.14 -1.34
## 8 8 4.1 tenured 4.14 -0.039
## 9 9 3.4 tenured 4.14 -0.739
## 10 10 4.5 tenured 4.14 0.361
## # ... with 453 more rows
# Plot residuals
# A roughly bell-shaped residual histogram centered at 0 supports the fit.
ggplot(model_score_4_points, aes(x=residual)) +
geom_histogram() +
labs(x = "residuals", title = "Residuals from score ~ rank model")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Chapter 3 - Modeling with Multiple Regression
Explaining house price with year and size:
Predicting house price using year and size:
Explaining house price with size and condition:
Predicting house price using size and condition:
Example code includes:
# Create scatterplot with regression line
ggplot(house_prices, aes(x=bedrooms, y = log10_price)) +
geom_point() +
labs(x = "Number of bedrooms", y = "log10 price") +
geom_smooth(method = "lm", se = FALSE)
# Remove outlier
# Drop the single 33-bedroom outlier and add log10 living space.
house_prices_transform <- house_prices %>%
filter(bedrooms < 33) %>%
mutate(log10_sqft_living=log10(sqft_living))
# Create scatterplot with regression line
ggplot(house_prices_transform, aes(x = bedrooms, y = log10_price)) +
geom_point() +
labs(x = "Number of bedrooms", y = "log10 price") +
geom_smooth(method = "lm", se = FALSE)
# Fit model
# Multiple regression with two numeric predictors, price on the log10 scale.
model_price_2 <- lm(log10_price ~ log10_sqft_living + bedrooms, data = house_prices_transform)
# Get regression table
moderndive::get_regression_table(model_price_2)
## # A tibble: 3 x 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept 2.69 0.023 116. 0 2.65 2.74
## 2 log10_sqft_living 0.941 0.008 118. 0 0.925 0.957
## 3 bedrooms -0.033 0.002 -20.5 0 -0.036 -0.03
# Make prediction in log10 dollars
# Predicted log10 price for a 1000 sq ft, 3-bedroom house.
2.69 + 0.941 * log10(1000) - 0.033 * 3
## [1] 5.414
# Make prediction dollars
# Back-transform to dollars (10^x undoes log10).
10**(2.69 + 0.941 * log10(1000) - 0.033 * 3)
## [1] 259417.9
# Automate prediction and residual computation
# Sum of squared residuals: the quantity least squares minimizes.
moderndive::get_regression_points(model_price_2) %>%
mutate(squared_residuals = residual**2) %>%
summarize(sum_squared_residuals = sum(squared_residuals))
## # A tibble: 1 x 1
## sum_squared_residuals
## <dbl>
## 1 604.
# Fit model
# One numeric and one logical predictor (waterfront acts as an intercept offset).
model_price_4 <- lm(log10_price ~ log10_sqft_living + waterfront, data = house_prices_transform)
# Get regression table
moderndive::get_regression_table(model_price_4)
## # A tibble: 3 x 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept 2.96 0.02 146. 0 2.92 3.00
## 2 log10_sqft_living 0.825 0.006 134. 0 0.813 0.837
## 3 waterfrontTRUE 0.322 0.013 24.5 0 0.296 0.348
# Prediction for House A
# log10 size 2.9, on the waterfront (0.322 offset applies).
10**(2.96 + 0.825*2.9 + 0.322)
## [1] 472606.8
# Prediction for House B
# log10 size 3.1, not on the waterfront.
10**(2.96 + 0.825*3.1 + 0)
## [1] 329230.5
# View the "new" houses
new_houses_2 <- tibble(log10_sqft_living=c(2.9, 3.1), waterfront=c(TRUE, FALSE))
new_houses_2
## # A tibble: 2 x 2
## log10_sqft_living waterfront
## <dbl> <lgl>
## 1 2.9 TRUE
## 2 3.1 FALSE
# Get predictions price_hat in dollars on "new" houses
# newdata scores out-of-sample rows; small differences from the hand
# calculations above come from coefficient rounding.
moderndive::get_regression_points(model_price_4, newdata = new_houses_2) %>%
mutate(price_hat = 10**log10_price_hat)
## # A tibble: 2 x 5
## ID log10_sqft_living waterfront log10_price_hat price_hat
## <int> <dbl> <lgl> <dbl> <dbl>
## 1 1 2.9 TRUE 5.67 472063.
## 2 2 3.1 FALSE 5.52 328095.
Chapter 4 - Model Selection and Assessment
Model selection and assessment:
Assessing model fit with R-squared:
Assessing predictions with RMSE:
Validation set prediction framework:
Next steps:
Example code includes:
# Model 2
model_price_2 <- lm(log10_price ~ log10_sqft_living + bedrooms, data = house_prices_transform)
# Calculate squared residuals
# Sum of squared residuals for model 2 (lower means a better fit).
moderndive::get_regression_points(model_price_2) %>%
mutate(sq_residuals=residual**2) %>%
summarize(sum_sq_residuals=sum(sq_residuals))
## # A tibble: 1 x 1
## sum_sq_residuals
## <dbl>
## 1 604.
# Model 4
model_price_4 <- lm(log10_price ~ log10_sqft_living + waterfront, data = house_prices_transform)
# Calculate squared residuals
# Model 4 has the smaller sum (599 vs 604), so it fits slightly better.
moderndive::get_regression_points(model_price_4) %>%
mutate(sq_residuals = residual**2) %>%
summarize(sum_sq_residuals=sum(sq_residuals))
## # A tibble: 1 x 1
## sum_sq_residuals
## <dbl>
## 1 599.
# Get fitted/values & residuals, compute R^2 using residuals
# R^2 = 1 - var(residuals) / var(response): share of variance explained.
moderndive::get_regression_points(model_price_2) %>%
summarize(r_squared = 1 - var(residual) / var(log10_price))
## # A tibble: 1 x 1
## r_squared
## <dbl>
## 1 0.466
# Get fitted/values & residuals, compute R^2 using residuals
moderndive::get_regression_points(model_price_4) %>%
summarize(r_squared = 1 - var(residual) / var(log10_price))
## # A tibble: 1 x 1
## r_squared
## <dbl>
## 1 0.470
# Get all residuals, square them, take the mean and square root
# RMSE: typical prediction error, on the log10 price scale.
moderndive::get_regression_points(model_price_2) %>%
mutate(sq_residuals = residual^2) %>%
summarize(mse = mean(sq_residuals)) %>%
mutate(rmse = sqrt(mse))
## # A tibble: 1 x 2
## mse rmse
## <dbl> <dbl>
## 1 0.0279 0.167
# MSE and RMSE for model_price_2
# Same computation condensed into a single summarize() step.
moderndive::get_regression_points(model_price_2) %>%
mutate(sq_residuals = residual^2) %>%
summarize(mse = mean(sq_residuals), rmse = sqrt(mean(sq_residuals)))
## # A tibble: 1 x 2
## mse rmse
## <dbl> <dbl>
## 1 0.0279 0.167
# MSE and RMSE for model_price_4
moderndive::get_regression_points(model_price_4) %>%
mutate(sq_residuals = residual^2) %>%
summarize(mse = mean(sq_residuals), rmse = sqrt(mean(sq_residuals)))
## # A tibble: 1 x 2
## mse rmse
## <dbl> <dbl>
## 1 0.0277 0.166
# Set random number generator seed value for reproducibility
set.seed(76)
# Randomly reorder the rows
# sample_frac(size = 1) without replacement is a full shuffle.
house_prices_shuffled <- house_prices_transform %>%
sample_frac(size = 1, replace = FALSE)
# Train/test split
# First 10,000 shuffled rows train the model; the remainder is held out.
train <- house_prices_shuffled %>%
slice(1:10000)
test <- house_prices_shuffled %>%
slice(10001:nrow(.))
# Fit model to training set
train_model_2 <- lm(log10_price ~ log10_sqft_living + bedrooms, data=train)
# Compute RMSE (train)
moderndive::get_regression_points(train_model_2) %>%
mutate(sq_residuals = residual**2) %>%
summarize(rmse = sqrt(mean(sq_residuals)))
## # A tibble: 1 x 1
## rmse
## <dbl>
## 1 0.167
# Compute RMSE (test)
# newdata scores the held-out rows; matching train/test RMSE indicates
# the model is not overfitting.
moderndive::get_regression_points(train_model_2, newdata = test) %>%
mutate(sq_residuals = residual**2) %>%
summarize(rmse = sqrt(mean(sq_residuals)))
## # A tibble: 1 x 1
## rmse
## <dbl>
## 1 0.167
Chapter 1 - Introduction to Survey Data
What are survey weights?
Specifying elements of the design in R:
Visualizing impact of survey weights:
Example code includes:
colTypes <- "FINLWT21 numeric _ FINCBTAX integer _ BLS_URBN integer _ POPSIZE integer _ EDUC_REF character _ EDUCA2 character _ AGE_REF integer _ AGE2 character _ SEX_REF integer _ SEX2 integer _ REF_RACE integer _ RACE2 integer _ HISP_REF integer _ HISP2 integer _ FAM_TYPE integer _ MARITAL1 integer _ REGION integer _ SMSASTAT integer _ HIGH_EDU character _ EHOUSNGC numeric _ TOTEXPCQ numeric _ FOODCQ numeric _ TRANSCQ numeric _ HEALTHCQ numeric _ ENTERTCQ numeric _ EDUCACQ integer _ TOBACCCQ numeric _ STUDFINX character _ IRAX character _ CUTENURE integer _ FAM_SIZE integer _ VEHQ integer _ ROOMSQ character _ INC_HRS1 character _ INC_HRS2 character _ EARNCOMP integer _ NO_EARNR integer _ OCCUCOD1 character _ OCCUCOD2 character _ STATE character _ DIVISION integer _ TOTXEST integer _ CREDFINX character _ CREDITB integer _ CREDITX character _ BUILDING character _ ST_HOUS integer _ INT_PHON character _ INT_HOME character _ "
ce <- readr::read_csv("./RInputFiles/ce.csv")
## Parsed with column specification:
## cols(
## .default = col_double(),
## EDUC_REF = col_character(),
## EDUCA2 = col_character(),
## AGE2 = col_character(),
## HIGH_EDU = col_character(),
## STUDFINX = col_character(),
## IRAX = col_character(),
## ROOMSQ = col_character(),
## INC_HRS1 = col_character(),
## INC_HRS2 = col_character(),
## OCCUCOD1 = col_character(),
## OCCUCOD2 = col_character(),
## STATE = col_character(),
## CREDFINX = col_character(),
## CREDITX = col_character(),
## BUILDING = col_character(),
## INT_PHON = col_logical(),
## INT_HOME = col_logical()
## )
## See spec(...) for full column specifications.
glimpse(ce)
## Observations: 6,301
## Variables: 49
## $ FINLWT21 <dbl> 25984.767, 6581.018, 20208.499, 18078.372, 20111.619, 1990...
## $ FINCBTAX <dbl> 116920, 200, 117000, 0, 2000, 942, 0, 91000, 95000, 40037,...
## $ BLS_URBN <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ POPSIZE <dbl> 2, 3, 4, 2, 2, 2, 1, 2, 5, 2, 3, 2, 2, 3, 4, 3, 3, 1, 4, 1...
## $ EDUC_REF <chr> "16", "15", "16", "15", "14", "11", "10", "13", "12", "12"...
## $ EDUCA2 <chr> "15", "15", "13", NA, NA, NA, NA, "15", "15", "14", "12", ...
## $ AGE_REF <dbl> 63, 50, 47, 37, 51, 63, 77, 37, 51, 64, 26, 59, 81, 51, 67...
## $ AGE2 <chr> "50", "47", "46", ".", ".", ".", ".", "36", "53", "67", "4...
## $ SEX_REF <dbl> 1, 1, 2, 1, 2, 1, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2...
## $ SEX2 <dbl> 2, 2, 1, NA, NA, NA, NA, 2, 2, 1, 1, 1, NA, NA, NA, 1, NA,...
## $ REF_RACE <dbl> 1, 4, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1...
## $ RACE2 <dbl> 1, 4, 1, NA, NA, NA, NA, 1, 1, 1, 1, 1, NA, NA, NA, 2, NA,...
## $ HISP_REF <dbl> 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1...
## $ HISP2 <dbl> 2, 2, 1, NA, NA, NA, NA, 2, 2, 2, 2, 2, NA, NA, NA, 2, NA,...
## $ FAM_TYPE <dbl> 3, 4, 1, 8, 9, 9, 8, 3, 1, 1, 3, 1, 8, 9, 8, 5, 9, 4, 8, 3...
## $ MARITAL1 <dbl> 1, 1, 1, 5, 3, 3, 2, 1, 1, 1, 1, 1, 2, 3, 5, 1, 3, 1, 3, 1...
## $ REGION <dbl> 4, 4, 3, 4, 4, 3, 4, 1, 3, 2, 1, 4, 1, 3, 3, 3, 2, 1, 2, 4...
## $ SMSASTAT <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ HIGH_EDU <chr> "16", "15", "16", "15", "14", "11", "10", "15", "15", "14"...
## $ EHOUSNGC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ TOTEXPCQ <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ FOODCQ <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ TRANSCQ <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ HEALTHCQ <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ ENTERTCQ <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ EDUCACQ <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ TOBACCCQ <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ STUDFINX <chr> ".", ".", ".", ".", ".", ".", ".", ".", ".", ".", ".", "."...
## $ IRAX <chr> "1000000", "10000", "0", ".", ".", "0", "0", "15000", ".",...
## $ CUTENURE <dbl> 1, 1, 1, 1, 1, 2, 4, 1, 1, 2, 1, 2, 2, 2, 2, 4, 1, 1, 1, 4...
## $ FAM_SIZE <dbl> 4, 6, 2, 1, 2, 2, 1, 5, 2, 2, 4, 2, 1, 2, 1, 4, 2, 4, 1, 3...
## $ VEHQ <dbl> 3, 5, 0, 4, 2, 0, 0, 2, 4, 2, 3, 2, 1, 3, 1, 2, 4, 4, 0, 2...
## $ ROOMSQ <chr> "8", "5", "6", "4", "4", "4", "7", "5", "4", "9", "6", "10...
## $ INC_HRS1 <chr> "40", "40", "40", "44", "40", ".", ".", "40", "40", ".", "...
## $ INC_HRS2 <chr> "30", "40", "52", ".", ".", ".", ".", "40", "40", ".", "65...
## $ EARNCOMP <dbl> 3, 2, 2, 1, 4, 7, 8, 2, 2, 8, 2, 8, 8, 7, 8, 2, 7, 3, 1, 2...
## $ NO_EARNR <dbl> 4, 2, 2, 1, 2, 1, 0, 2, 2, 0, 2, 0, 0, 1, 0, 2, 1, 3, 1, 2...
## $ OCCUCOD1 <chr> "03", "03", "05", "03", "04", NA, NA, "12", "04", NA, "01"...
## $ OCCUCOD2 <chr> "04", "02", "01", NA, NA, NA, NA, "02", "03", NA, "11", NA...
## $ STATE <chr> "41", "15", "48", "06", "06", "48", "06", "42", NA, "27", ...
## $ DIVISION <dbl> 9, 9, 7, 9, 9, 7, 9, 2, NA, 4, 1, 8, 2, 5, 6, 7, 3, 2, 3, ...
## $ TOTXEST <dbl> 15452, 11459, 15738, 25978, 588, 0, 0, 7261, 9406, -1414, ...
## $ CREDFINX <chr> "0", ".", "0", ".", "5", ".", ".", ".", ".", "0", ".", "0"...
## $ CREDITB <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ CREDITX <chr> "4000", "5000", "2000", ".", "7000", "1800", ".", "6000", ...
## $ BUILDING <chr> "01", "01", "01", "02", "08", "01", "01", "01", "01", "01"...
## $ ST_HOUS <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2...
## $ INT_PHON <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ INT_HOME <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
# Build a "name type _ " signature string for every column of ce and
# compare it against the expected signature stored in colTypes.
# class(col)[1] guarantees exactly one class string per column -- class()
# can return a vector (e.g. c("POSIXct", "POSIXt")), which in the old
# grow-in-a-loop version would silently recycle paste0() and turn the
# accumulator into a multi-element vector. A single vectorized paste0()
# with collapse also avoids O(n^2) string growth.
ceColTypes <- paste0(
  names(ce), " ",
  vapply(ce, function(col) class(col)[1], character(1)), " _ ",
  collapse = ""
)
all.equal(colTypes, ceColTypes)
## [1] "1 string mismatch"
# Construct a histogram of the weights
# FINLWT21 holds the survey weights (see section intro); the distribution
# shows how unevenly observations are weighted.
ggplot(data = ce, mapping = aes(x = FINLWT21)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# In the next few exercises we will practice specifying sampling designs using different samples from the api dataset, located in the survey package
# The api dataset contains the Academic Performance Index and demographic information for schools in California
# The apisrs dataset is a simple random sample of schools from the api dataset
# Notice that pw contains the survey weights and fpc contains the total number of schools in the population
data(api, package="survey")
library(survey)
## Loading required package: grid
## Loading required package: survival
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
# Look at the apisrs dataset
glimpse(apisrs)
## Observations: 200
## Variables: 39
## $ cds <chr> "15739081534155", "19642126066716", "30664493030640", "196...
## $ stype <fct> H, E, H, E, E, E, M, E, E, E, E, H, M, E, E, E, M, M, H, E...
## $ name <chr> "McFarland High", "Stowers (Cecil ", "Brea-Olinda Hig", "A...
## $ sname <chr> "McFarland High", "Stowers (Cecil B.) Elementary", "Brea-O...
## $ snum <dbl> 1039, 1124, 2868, 1273, 4926, 2463, 2031, 1736, 2142, 4754...
## $ dname <chr> "McFarland Unified", "ABC Unified", "Brea-Olinda Unified",...
## $ dnum <int> 432, 1, 79, 187, 640, 284, 401, 401, 470, 632, 401, 753, 7...
## $ cname <chr> "Kern", "Los Angeles", "Orange", "Los Angeles", "San Luis ...
## $ cnum <int> 14, 18, 29, 18, 39, 18, 18, 18, 18, 37, 18, 24, 14, 1, 47,...
## $ flag <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ pcttest <int> 98, 100, 98, 99, 99, 93, 98, 99, 100, 90, 95, 100, 97, 99,...
## $ api00 <int> 462, 878, 734, 772, 739, 835, 456, 506, 543, 649, 556, 671...
## $ api99 <int> 448, 831, 742, 657, 719, 822, 472, 474, 458, 604, 575, 620...
## $ target <int> 18, NA, 3, 7, 4, NA, 16, 16, 17, 10, 11, 9, 14, 5, 15, 10,...
## $ growth <int> 14, 47, -8, 115, 20, 13, -16, 32, 85, 45, -19, 51, 4, 51, ...
## $ sch.wide <fct> No, Yes, No, Yes, Yes, Yes, No, Yes, Yes, Yes, No, Yes, No...
## $ comp.imp <fct> Yes, Yes, No, Yes, Yes, Yes, No, Yes, Yes, No, No, Yes, No...
## $ both <fct> No, Yes, No, Yes, Yes, Yes, No, Yes, Yes, No, No, Yes, No,...
## $ awards <fct> No, Yes, No, Yes, Yes, No, No, Yes, Yes, No, No, Yes, No, ...
## $ meals <int> 44, 8, 10, 70, 43, 16, 81, 98, 94, 85, 81, 67, 77, 20, 70,...
## $ ell <int> 31, 25, 10, 25, 12, 19, 40, 65, 65, 57, 4, 25, 32, 16, 23,...
## $ yr.rnd <fct> NA, NA, NA, NA, NA, NA, NA, No, NA, NA, NA, NA, NA, NA, No...
## $ mobility <int> 6, 15, 7, 23, 12, 13, 22, 43, 15, 10, 20, 12, 4, 32, 17, 9...
## $ acs.k3 <int> NA, 19, NA, 23, 20, 19, NA, 18, 19, 16, 16, NA, NA, 19, 21...
## $ acs.46 <int> NA, 30, NA, NA, 29, 29, 30, 29, 32, 25, 27, NA, NA, 29, 30...
## $ acs.core <int> 24, NA, 28, NA, NA, NA, 27, NA, NA, 30, NA, 17, 27, NA, NA...
## $ pct.resp <int> 82, 97, 95, 100, 91, 71, 49, 75, 99, 49, 62, 96, 77, 96, 3...
## $ not.hsg <int> 44, 4, 5, 37, 8, 1, 30, 49, 48, 23, 5, 44, 40, 4, 14, 18, ...
## $ hsg <int> 34, 10, 9, 40, 21, 8, 27, 31, 34, 36, 38, 19, 34, 14, 57, ...
## $ some.col <int> 12, 23, 21, 14, 27, 20, 18, 15, 14, 14, 29, 17, 16, 25, 18...
## $ col.grad <int> 7, 43, 41, 8, 34, 38, 22, 2, 4, 21, 24, 19, 8, 37, 10, 23,...
## $ grad.sch <int> 3, 21, 24, 1, 10, 34, 2, 3, 1, 6, 5, 2, 2, 19, 1, 3, 10, 3...
## $ avg.ed <dbl> 1.91, 3.66, 3.71, 1.96, 3.17, 3.96, 2.39, 1.79, 1.77, 2.51...
## $ full <int> 71, 90, 83, 85, 100, 75, 72, 69, 68, 81, 84, 100, 89, 95, ...
## $ emer <int> 35, 10, 18, 18, 0, 20, 25, 22, 29, 7, 16, 0, 11, 5, 6, 10,...
## $ enroll <int> 477, 478, 1410, 342, 217, 258, 1274, 566, 645, 311, 328, 2...
## $ api.stu <int> 429, 420, 1287, 291, 189, 211, 1090, 353, 563, 258, 253, 1...
## $ pw <dbl> 30.97, 30.97, 30.97, 30.97, 30.97, 30.97, 30.97, 30.97, 30...
## $ fpc <dbl> 6194, 6194, 6194, 6194, 6194, 6194, 6194, 6194, 6194, 6194...
# Specify a simple random sampling for apisrs
# id = ~1 means no clustering; weights (pw) and the finite population
# correction (fpc) come from columns of the data.
apisrs_design <- svydesign(data = apisrs, weights = ~pw, fpc = ~fpc, id = ~1)
# Print a summary of the design
# All sampling probabilities are identical, as expected under SRS.
summary(apisrs_design)
## Independent Sampling design
## svydesign(data = apisrs, weights = ~pw, fpc = ~fpc, id = ~1)
## Probabilities:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.03229 0.03229 0.03229 0.03229 0.03229 0.03229
## Population size (PSUs): 6194
## Data variables:
## [1] "cds" "stype" "name" "sname" "snum" "dname"
## [7] "dnum" "cname" "cnum" "flag" "pcttest" "api00"
## [13] "api99" "target" "growth" "sch.wide" "comp.imp" "both"
## [19] "awards" "meals" "ell" "yr.rnd" "mobility" "acs.k3"
## [25] "acs.46" "acs.core" "pct.resp" "not.hsg" "hsg" "some.col"
## [31] "col.grad" "grad.sch" "avg.ed" "full" "emer" "enroll"
## [37] "api.stu" "pw" "fpc"
# Now let's practice specifying a stratified sampling design, using the dataset apistrat
# The schools are stratified based on the school type stype where E = Elementary, M = Middle, and H = High School
# For each school type, a simple random sample of schools was taken
# Glimpse the data
glimpse(apistrat)
## Observations: 200
## Variables: 39
## $ cds <chr> "19647336097927", "19647336016018", "19648816021505", "196...
## $ stype <fct> E, E, E, E, E, E, E, E, E, E, M, M, H, M, H, E, E, M, M, E...
## $ name <chr> "Open Magnet: Ce", "Belvedere Eleme", "Altadena Elemen", "...
## $ sname <chr> "Open Magnet: Center for Individual (Char", "Belvedere Ele...
## $ snum <dbl> 2077, 1622, 2236, 1921, 6140, 6077, 6071, 904, 4637, 4311,...
## $ dname <chr> "Los Angeles Unified", "Los Angeles Unified", "Pasadena Un...
## $ dnum <int> 401, 401, 541, 401, 460, 689, 689, 41, 702, 135, 590, 767,...
## $ cname <chr> "Los Angeles", "Los Angeles", "Los Angeles", "Los Angeles"...
## $ cnum <int> 18, 18, 18, 18, 55, 55, 55, 14, 36, 36, 35, 32, 9, 1, 32, ...
## $ flag <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ pcttest <int> 99, 100, 99, 100, 100, 100, 99, 98, 100, 100, 99, 99, 93, ...
## $ api00 <int> 840, 516, 531, 501, 720, 805, 778, 731, 592, 669, 496, 505...
## $ api99 <int> 816, 476, 544, 457, 659, 780, 787, 731, 508, 658, 479, 499...
## $ target <int> NA, 16, 13, 17, 7, 1, 1, 3, 15, 7, 16, 15, 17, 20, 13, 18,...
## $ growth <int> 24, 40, -13, 44, 61, 25, -9, 0, 84, 11, 17, 6, 7, 3, -10, ...
## $ sch.wide <fct> Yes, Yes, No, Yes, Yes, Yes, No, No, Yes, Yes, Yes, No, No...
## $ comp.imp <fct> No, Yes, No, Yes, Yes, Yes, No, No, Yes, No, No, No, No, N...
## $ both <fct> No, Yes, No, Yes, Yes, Yes, No, No, Yes, No, No, No, No, N...
## $ awards <fct> No, Yes, No, Yes, Yes, Yes, No, No, Yes, No, No, No, No, N...
## $ meals <int> 33, 98, 64, 83, 26, 7, 9, 45, 75, 47, 69, 60, 66, 54, 35, ...
## $ ell <int> 25, 77, 23, 63, 17, 0, 2, 2, 58, 23, 25, 10, 43, 26, 7, 66...
## $ yr.rnd <fct> No, Yes, No, No, No, No, No, No, Yes, No, No, No, No, No, ...
## $ mobility <int> 11, 26, 17, 13, 31, 12, 10, 15, 23, 19, 26, 22, 16, 44, 18...
## $ acs.k3 <int> 20, 19, 20, 17, 20, 19, 19, 19, 20, 18, NA, NA, NA, NA, NA...
## $ acs.46 <int> 29, 28, 30, 30, 30, 29, 31, 31, 32, 29, 32, 32, NA, 32, NA...
## $ acs.core <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 30, 32, 27, 29, 28...
## $ pct.resp <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 87, 67, 50, 70, 71, 2, 91...
## $ not.hsg <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 31, 49, 12, 20, 45, 9, 22...
## $ hsg <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 34, 20, 33, 20, 36, 64, 2...
## $ some.col <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 22, 15, 23, 31, 11, 18, 3...
## $ col.grad <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 12, 29, 23, 8, 9, 16,...
## $ grad.sch <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 3, 6, 0, 0, 11, 0, ...
## $ avg.ed <dbl> 3.32, 1.67, 2.34, 1.86, 3.17, 3.64, 3.55, 3.10, 2.17, 2.82...
## $ full <int> 100, 57, 81, 64, 90, 95, 96, 93, 91, 96, 84, 65, 93, 55, 8...
## $ emer <int> 0, 40, 26, 24, 7, 0, 0, 8, 14, 0, 18, 37, 17, 26, 19, 33, ...
## $ enroll <int> 276, 841, 441, 298, 354, 330, 385, 583, 763, 381, 1293, 10...
## $ api.stu <int> 241, 631, 415, 288, 319, 315, 363, 510, 652, 322, 1035, 81...
## $ pw <dbl> 44.21, 44.21, 44.21, 44.21, 44.21, 44.21, 44.21, 44.21, 44...
## $ fpc <dbl> 4421, 4421, 4421, 4421, 4421, 4421, 4421, 4421, 4421, 4421...
# Summarize strata sample sizes
# 100 elementary, 50 middle and 50 high schools were sampled.
apistrat %>%
count(stype)
## # A tibble: 3 x 2
## stype n
## <fct> <int>
## 1 E 100
## 2 H 50
## 3 M 50
# Specify the design
# Stratified design: strata = ~stype adds school-type strata on top of
# the weights/fpc specification used for the SRS design above.
strat_design <- svydesign(data = apistrat, weights = ~pw, fpc = ~fpc, id = ~1, strata = ~stype)
# Look at the summary information for the stratified design
# Sampling probabilities now differ across strata.
summary(strat_design)
## Stratified Independent Sampling design
## svydesign(data = apistrat, weights = ~pw, fpc = ~fpc, id = ~1,
## strata = ~stype)
## Probabilities:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.02262 0.02262 0.03587 0.04014 0.05339 0.06623
## Stratum Sizes:
## E H M
## obs 100 50 50
## design.PSU 100 50 50
## actual.PSU 100 50 50
## Population stratum sizes (PSUs):
## E H M
## 4421 755 1018
## Data variables:
## [1] "cds" "stype" "name" "sname" "snum" "dname"
## [7] "dnum" "cname" "cnum" "flag" "pcttest" "api00"
## [13] "api99" "target" "growth" "sch.wide" "comp.imp" "both"
## [19] "awards" "meals" "ell" "yr.rnd" "mobility" "acs.k3"
## [25] "acs.46" "acs.core" "pct.resp" "not.hsg" "hsg" "some.col"
## [31] "col.grad" "grad.sch" "avg.ed" "full" "emer" "enroll"
## [37] "api.stu" "pw" "fpc"
# Now let's practice specifying a cluster sampling design, using the dataset apiclus2
# The schools were clustered based on school districts, dnum
# Within a sampled school district, 5 schools were randomly selected for the sample
# The schools are denoted by snum
# The number of districts is given by fpc1 and the number of schools in the sampled districts is given by fpc2
# Glimpse the data
glimpse(apiclus2)
## Observations: 126
## Variables: 40
## $ cds <chr> "31667796031017", "55751846054837", "41688746043517", "416...
## $ stype <fct> E, E, E, M, E, E, E, E, M, H, E, M, E, E, E, E, H, E, E, M...
## $ name <chr> "Alta-Dutch Flat", "Tenaya Elementa", "Panorama Elemen", "...
## $ sname <chr> "Alta-Dutch Flat Elementary", "Tenaya Elementary", "Panora...
## $ snum <dbl> 3269, 5979, 4958, 4957, 4956, 4915, 2548, 2550, 2549, 348,...
## $ dname <chr> "Alta-Dutch Flat Elem", "Big Oak Flat-Grvlnd Unif", "Brisb...
## $ dnum <int> 15, 63, 83, 83, 83, 117, 132, 132, 132, 152, 152, 152, 173...
## $ cname <chr> "Placer", "Tuolumne", "San Mateo", "San Mateo", "San Mateo...
## $ cnum <int> 30, 54, 40, 40, 40, 39, 19, 19, 19, 5, 5, 5, 36, 36, 36, 3...
## $ flag <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ pcttest <int> 100, 100, 98, 100, 98, 100, 100, 100, 100, 96, 98, 100, 10...
## $ api00 <int> 821, 773, 600, 740, 716, 811, 472, 520, 568, 591, 544, 612...
## $ api99 <int> 785, 718, 632, 740, 711, 779, 432, 494, 589, 585, 554, 583...
## $ target <int> 1, 4, 8, 3, 4, 1, 18, 15, 11, 11, 12, 11, NA, NA, NA, NA, ...
## $ growth <int> 36, 55, -32, 0, 5, 32, 40, 26, -21, 6, -10, 29, 14, 2, 30,...
## $ sch.wide <fct> Yes, Yes, No, No, Yes, Yes, Yes, Yes, No, No, No, Yes, Yes...
## $ comp.imp <fct> Yes, Yes, No, No, Yes, Yes, Yes, Yes, No, No, No, Yes, Yes...
## $ both <fct> Yes, Yes, No, No, Yes, Yes, Yes, Yes, No, No, No, Yes, Yes...
## $ awards <fct> Yes, Yes, No, No, Yes, Yes, Yes, Yes, No, No, No, Yes, Yes...
## $ meals <int> 27, 43, 33, 11, 5, 25, 78, 76, 68, 42, 63, 54, 0, 4, 1, 6,...
## $ ell <int> 0, 0, 5, 4, 2, 5, 38, 34, 34, 23, 42, 24, 3, 6, 2, 1, 37, ...
## $ yr.rnd <fct> No, No, No, No, No, No, No, No, No, No, No, No, No, No, No...
## $ mobility <int> 14, 12, 9, 8, 6, 19, 13, 13, 15, 4, 15, 15, 24, 19, 14, 14...
## $ acs.k3 <int> 17, 18, 19, NA, 18, 20, 19, 25, NA, NA, 20, NA, 19, 18, 19...
## $ acs.46 <int> 20, 34, 29, 30, 28, 22, NA, 23, 24, NA, NA, 27, 27, 25, 27...
## $ acs.core <int> NA, NA, NA, 24, NA, 31, NA, NA, 25, 21, NA, 18, NA, NA, NA...
## $ pct.resp <int> 89, 98, 79, 96, 98, 93, 100, 46, 91, 94, 93, 88, 90, 99, 0...
## $ not.hsg <int> 4, 8, 8, 5, 3, 5, 48, 30, 63, 20, 29, 27, 0, 1, 0, 1, 50, ...
## $ hsg <int> 16, 33, 28, 27, 14, 9, 32, 27, 16, 18, 32, 25, 0, 7, 0, 5,...
## $ some.col <int> 53, 37, 30, 35, 22, 30, 15, 21, 13, 27, 26, 24, 4, 8, 0, 8...
## $ col.grad <int> 21, 15, 32, 27, 58, 37, 4, 13, 6, 28, 7, 18, 51, 42, 0, 42...
## $ grad.sch <int> 6, 7, 1, 6, 3, 19, 1, 9, 2, 7, 6, 7, 44, 41, 100, 45, 1, 6...
## $ avg.ed <dbl> 3.07, 2.79, 2.90, 3.03, 3.44, 3.56, 1.77, 2.42, 1.68, 2.84...
## $ full <int> 100, 100, 100, 82, 100, 94, 96, 86, 75, 100, 100, 97, 100,...
## $ emer <int> 0, 0, 0, 18, 8, 6, 8, 24, 21, 4, 4, 3, 0, 4, 0, 4, 28, 18,...
## $ enroll <int> 152, 312, 173, 201, 147, 234, 184, 512, 543, 332, 217, 520...
## $ api.stu <int> 120, 270, 151, 179, 136, 189, 158, 419, 423, 303, 182, 438...
## $ pw <dbl> 18.925, 18.925, 18.925, 18.925, 18.925, 18.925, 18.925, 18...
## $ fpc1 <dbl> 757, 757, 757, 757, 757, 757, 757, 757, 757, 757, 757, 757...
## $ fpc2 <int> <array[26]>
# Specify the design
# Two-stage cluster sample: first stage is districts (dnum), second stage is
# schools within districts (snum). fpc1/fpc2 supply the finite population
# correction for each stage; pw holds the sampling weights.
apiclus_design <- svydesign(id = ~dnum + snum, data = apiclus2, weights = ~pw, fpc = ~fpc1 + fpc2)
#Look at the summary information stored for both designs
summary(apiclus_design)
## 2 - level Cluster Sampling design
## With (40, 126) clusters.
## svydesign(id = ~dnum + snum, data = apiclus2, weights = ~pw,
## fpc = ~fpc1 + fpc2)
## Probabilities:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.003669 0.037743 0.052840 0.042390 0.052840 0.052840
## Population size (PSUs): 757
## Data variables:
## [1] "cds" "stype" "name" "sname" "snum" "dname"
## [7] "dnum" "cname" "cnum" "flag" "pcttest" "api00"
## [13] "api99" "target" "growth" "sch.wide" "comp.imp" "both"
## [19] "awards" "meals" "ell" "yr.rnd" "mobility" "acs.k3"
## [25] "acs.46" "acs.core" "pct.resp" "not.hsg" "hsg" "some.col"
## [31] "col.grad" "grad.sch" "avg.ed" "full" "emer" "enroll"
## [37] "api.stu" "pw" "fpc1" "fpc2"
# Construct histogram of pw
# Simple random sample: every unit shares the same weight, so the
# histogram collapses to a single bar.
ggplot(data = apisrs, mapping = aes(x = pw)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Construct histogram of pw
# Stratified sample: one weight value per stratum.
ggplot(data = apistrat, mapping = aes(x = pw)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Construct histogram of pw
# Two-stage cluster sample: weights vary across clusters.
ggplot(data = apiclus2, mapping = aes(x = pw)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Load the raw NHANES extract and derive a 4-year MEC weight by halving the
# 2-year weight (presumably because two 2-year survey cycles are combined --
# TODO confirm against the NHANES analytic guidelines).
NHANESraw <- read.csv("./RInputFiles/NHANESraw.txt")
NHANESraw <- NHANESraw %>%
mutate(WTMEC4YR=WTMEC2YR / 2)
# First column name is mangled by read.csv (likely a BOM); restore it.
names(NHANESraw)[1] <- "SurveyYr"
glimpse(NHANESraw)
## Observations: 20,293
## Variables: 78
## $ SurveyYr <fct> 2009_10, 2009_10, 2009_10, 2009_10, 2009_10, 20...
## $ ID <int> 51624, 51625, 51626, 51627, 51628, 51629, 51630...
## $ Gender <fct> male, male, male, male, female, male, female, f...
## $ Age <int> 34, 4, 16, 10, 60, 26, 49, 1, 10, 80, 10, 80, 4...
## $ AgeMonths <int> 409, 49, 202, 131, 722, 313, 596, 12, 124, NA, ...
## $ Race1 <fct> White, Other, Black, Black, Black, Mexican, Whi...
## $ Race3 <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ Education <fct> c_HighSchool, NA, NA, NA, c_HighSchool, b_9_11t...
## $ MaritalStatus <fct> Married, NA, NA, NA, Widowed, Married, LivePart...
## $ HHIncome <fct> 25000-34999, 20000-24999, 45000-54999, 20000-24...
## $ HHIncomeMid <int> 30000, 22500, 50000, 22500, 12500, 30000, 40000...
## $ Poverty <dbl> 1.36, 1.07, 2.27, 0.81, 0.69, 1.01, 1.91, 1.36,...
## $ HomeRooms <int> 6, 9, 5, 6, 6, 4, 5, 5, 7, 4, 5, 5, 7, NA, 6, 6...
## $ HomeOwn <fct> Own, Own, Own, Rent, Rent, Rent, Rent, Rent, Ow...
## $ Work <fct> NotWorking, NA, NotWorking, NA, NotWorking, Wor...
## $ Weight <dbl> 87.4, 17.0, 72.3, 39.8, 116.8, 97.6, 86.7, 9.4,...
## $ Length <dbl> NA, NA, NA, NA, NA, NA, NA, 75.7, NA, NA, NA, N...
## $ HeadCirc <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ Height <dbl> 164.7, 105.4, 181.3, 147.8, 166.0, 173.0, 168.4...
## $ BMI <dbl> 32.22, 15.30, 22.00, 18.22, 42.39, 32.61, 30.57...
## $ BMICatUnder20yrs <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ BMI_WHO <fct> 30.0_plus, 12.0_18.5, 18.5_to_24.9, 12.0_18.5, ...
## $ Pulse <int> 70, NA, 68, 68, 72, 72, 86, NA, 70, 88, 84, 54,...
## $ BPSysAve <int> 113, NA, 109, 93, 150, 104, 112, NA, 108, 139, ...
## $ BPDiaAve <int> 85, NA, 59, 41, 68, 49, 75, NA, 53, 43, 45, 60,...
## $ BPSys1 <int> 114, NA, 112, 92, 154, 102, 118, NA, 106, 142, ...
## $ BPDia1 <int> 88, NA, 62, 36, 70, 50, 82, NA, 60, 62, 38, 62,...
## $ BPSys2 <int> 114, NA, 114, 94, 150, 104, 108, NA, 106, 140, ...
## $ BPDia2 <int> 88, NA, 60, 44, 68, 48, 74, NA, 50, 46, 40, 62,...
## $ BPSys3 <int> 112, NA, 104, 92, 150, 104, 116, NA, 110, 138, ...
## $ BPDia3 <int> 82, NA, 58, 38, 68, 50, 76, NA, 56, 40, 50, 58,...
## $ Testosterone <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ DirectChol <dbl> 1.29, NA, 1.55, 1.89, 1.16, 1.16, 1.16, NA, 1.5...
## $ TotChol <dbl> 3.49, NA, 4.97, 4.16, 5.22, 4.14, 6.70, NA, 4.1...
## $ UrineVol1 <int> 352, NA, 281, 139, 30, 202, 77, NA, 39, 128, 10...
## $ UrineFlow1 <dbl> NA, NA, 0.415, 1.078, 0.476, 0.563, 0.094, NA, ...
## $ UrineVol2 <int> NA, NA, NA, NA, 246, NA, NA, NA, NA, NA, NA, NA...
## $ UrineFlow2 <dbl> NA, NA, NA, NA, 2.51, NA, NA, NA, NA, NA, NA, N...
## $ Diabetes <fct> No, No, No, No, Yes, No, No, No, No, No, No, Ye...
## $ DiabetesAge <int> NA, NA, NA, NA, 56, NA, NA, NA, NA, NA, NA, 70,...
## $ HealthGen <fct> Good, NA, Vgood, NA, Fair, Good, Good, NA, NA, ...
## $ DaysPhysHlthBad <int> 0, NA, 2, NA, 20, 2, 0, NA, NA, 0, NA, 0, NA, N...
## $ DaysMentHlthBad <int> 15, NA, 0, NA, 25, 14, 10, NA, NA, 0, NA, 0, NA...
## $ LittleInterest <fct> Most, NA, NA, NA, Most, None, Several, NA, NA, ...
## $ Depressed <fct> Several, NA, NA, NA, Most, Most, Several, NA, N...
## $ nPregnancies <int> NA, NA, NA, NA, 1, NA, 2, NA, NA, NA, NA, NA, N...
## $ nBabies <int> NA, NA, NA, NA, 1, NA, 2, NA, NA, NA, NA, NA, N...
## $ Age1stBaby <int> NA, NA, NA, NA, NA, NA, 27, NA, NA, NA, NA, NA,...
## $ SleepHrsNight <int> 4, NA, 8, NA, 4, 4, 8, NA, NA, 6, NA, 9, NA, 7,...
## $ SleepTrouble <fct> Yes, NA, No, NA, No, No, Yes, NA, NA, No, NA, N...
## $ PhysActive <fct> No, NA, Yes, NA, No, Yes, No, NA, NA, Yes, NA, ...
## $ PhyActiveDays <int> NA, NA, 5, NA, NA, 2, NA, NA, NA, 4, NA, NA, NA...
## $ TVHrsDay <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ TVHrsDay.1 <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ TVHrsDayChild <int> NA, 4, NA, 1, NA, NA, NA, NA, 1, NA, 3, NA, 2, ...
## $ ComputerHrsDayChild <int> NA, 1, NA, 1, NA, NA, NA, NA, 0, NA, 0, NA, 1, ...
## $ Alcohol12PlusYr <fct> Yes, NA, NA, NA, No, Yes, Yes, NA, NA, Yes, NA,...
## $ AlcoholDay <int> NA, NA, NA, NA, NA, 19, 2, NA, NA, 1, NA, NA, N...
## $ AlcoholYear <int> 0, NA, NA, NA, 0, 48, 20, NA, NA, 52, NA, 0, NA...
## $ SmokeNow <fct> No, NA, NA, NA, Yes, No, Yes, NA, NA, No, NA, N...
## $ Smoke100 <fct> Yes, NA, NA, NA, Yes, Yes, Yes, NA, NA, Yes, NA...
## $ SmokeAge <int> 18, NA, NA, NA, 16, 15, 38, NA, NA, 16, NA, 21,...
## $ Marijuana <fct> Yes, NA, NA, NA, NA, Yes, Yes, NA, NA, NA, NA, ...
## $ AgeFirstMarij <int> 17, NA, NA, NA, NA, 10, 18, NA, NA, NA, NA, NA,...
## $ RegularMarij <fct> No, NA, NA, NA, NA, Yes, No, NA, NA, NA, NA, NA...
## $ AgeRegMarij <int> NA, NA, NA, NA, NA, 12, NA, NA, NA, NA, NA, NA,...
## $ HardDrugs <fct> Yes, NA, NA, NA, No, Yes, Yes, NA, NA, NA, NA, ...
## $ SexEver <fct> Yes, NA, NA, NA, Yes, Yes, Yes, NA, NA, NA, NA,...
## $ SexAge <int> 16, NA, NA, NA, 15, 9, 12, NA, NA, NA, NA, NA, ...
## $ SexNumPartnLife <int> 8, NA, NA, NA, 4, 10, 10, NA, NA, NA, NA, NA, N...
## $ SexNumPartYear <int> 1, NA, NA, NA, NA, 1, 1, NA, NA, NA, NA, NA, NA...
## $ SameSex <fct> No, NA, NA, NA, No, No, Yes, NA, NA, NA, NA, NA...
## $ SexOrientation <fct> Heterosexual, NA, NA, NA, NA, Heterosexual, Het...
## $ WTINT2YR <dbl> 80100.544, 53901.104, 13953.078, 11664.899, 200...
## $ WTMEC2YR <dbl> 81528.772, 56995.035, 14509.279, 12041.635, 210...
## $ SDMVPSU <int> 1, 2, 1, 2, 2, 1, 2, 2, 2, 1, 1, 1, 2, 2, 1, 1,...
## $ SDMVSTRA <int> 83, 79, 84, 86, 75, 88, 85, 86, 88, 77, 86, 79,...
## $ WTMEC4YR <dbl> 40764.386, 28497.518, 7254.639, 6020.818, 10500...
#Create table of average survey weights by race
# Average of WTMEC4YR within each Race1 level; a larger average weight means
# each sampled person of that group represents more people in the population.
tab_weights <- NHANESraw %>%
group_by(Race1) %>%
summarize(avg_wt = mean(WTMEC4YR))
#Print the table
tab_weights
## # A tibble: 5 x 2
## Race1 avg_wt
## <fct> <dbl>
## 1 Black 8026.
## 2 Hispanic 8579.
## 3 Mexican 8216.
## 4 Other 10116.
## 5 White 26236.
# The two important design variables in NHANESraw are SDMVSTRA, which contains the strata assignment for each unit, and SDMVPSU, which contains the cluster id within a given stratum
# Specify the NHANES design
# nest = TRUE tells svydesign that PSU ids repeat across strata and should be
# treated as nested within SDMVSTRA rather than as global ids.
NHANES_design <- svydesign(data = NHANESraw, strata = ~SDMVSTRA, id = ~SDMVPSU,
nest = TRUE, weights = ~WTMEC4YR
)
# Print summary of design
summary(NHANES_design)
## Stratified 1 - level Cluster Sampling design (with replacement)
## With (62) clusters.
## svydesign(data = NHANESraw, strata = ~SDMVSTRA, id = ~SDMVPSU,
## nest = TRUE, weights = ~WTMEC4YR)
## Probabilities:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 8.986e-06 5.664e-05 1.054e-04 Inf 1.721e-04 Inf
## Stratum Sizes:
## 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91
## obs 803 785 823 829 696 751 696 724 713 683 592 946 598 647 251 862 998
## design.PSU 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 3 3
## actual.PSU 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 3 3
## 92 93 94 95 96 97 98 99 100 101 102 103
## obs 875 602 688 722 676 608 708 682 700 715 624 296
## design.PSU 3 2 2 2 2 2 2 2 2 2 2 2
## actual.PSU 3 2 2 2 2 2 2 2 2 2 2 2
## Data variables:
## [1] "SurveyYr" "ID" "Gender"
## [4] "Age" "AgeMonths" "Race1"
## [7] "Race3" "Education" "MaritalStatus"
## [10] "HHIncome" "HHIncomeMid" "Poverty"
## [13] "HomeRooms" "HomeOwn" "Work"
## [16] "Weight" "Length" "HeadCirc"
## [19] "Height" "BMI" "BMICatUnder20yrs"
## [22] "BMI_WHO" "Pulse" "BPSysAve"
## [25] "BPDiaAve" "BPSys1" "BPDia1"
## [28] "BPSys2" "BPDia2" "BPSys3"
## [31] "BPDia3" "Testosterone" "DirectChol"
## [34] "TotChol" "UrineVol1" "UrineFlow1"
## [37] "UrineVol2" "UrineFlow2" "Diabetes"
## [40] "DiabetesAge" "HealthGen" "DaysPhysHlthBad"
## [43] "DaysMentHlthBad" "LittleInterest" "Depressed"
## [46] "nPregnancies" "nBabies" "Age1stBaby"
## [49] "SleepHrsNight" "SleepTrouble" "PhysActive"
## [52] "PhyActiveDays" "TVHrsDay" "TVHrsDay.1"
## [55] "TVHrsDayChild" "ComputerHrsDayChild" "Alcohol12PlusYr"
## [58] "AlcoholDay" "AlcoholYear" "SmokeNow"
## [61] "Smoke100" "SmokeAge" "Marijuana"
## [64] "AgeFirstMarij" "RegularMarij" "AgeRegMarij"
## [67] "HardDrugs" "SexEver" "SexAge"
## [70] "SexNumPartnLife" "SexNumPartYear" "SameSex"
## [73] "SexOrientation" "WTINT2YR" "WTMEC2YR"
## [76] "SDMVPSU" "SDMVSTRA" "WTMEC4YR"
# Number of clusters
# A cluster is a unique (stratum, PSU) pair, hence the two-column n_distinct.
NHANESraw %>%
summarize(n_clusters = n_distinct(SDMVSTRA, SDMVPSU))
## n_clusters
## 1 62
# Sample sizes in clusters
# count() gives the number of sampled observations per (stratum, PSU) pair.
NHANESraw %>%
count(SDMVSTRA, SDMVPSU)
## # A tibble: 62 x 3
## SDMVSTRA SDMVPSU n
## <int> <int> <int>
## 1 75 1 379
## 2 75 2 424
## 3 76 1 419
## 4 76 2 366
## 5 77 1 441
## 6 77 2 382
## 7 78 1 378
## 8 78 2 451
## 9 79 1 349
## 10 79 2 347
## # ... with 52 more rows
Chapter 2 - Exploring categorical data
Visualizing categorical variables:
Exploring two categorical variables:
Inference for categorical variables:
Example code includes:
# Specify the survey design
# Re-derive the 4-year weight and rebuild the design object (same design as
# constructed earlier in this document).
NHANESraw <- mutate(NHANESraw, WTMEC4YR = .5 * WTMEC2YR)
NHANES_design <- svydesign(data = NHANESraw, strata = ~SDMVSTRA, id = ~SDMVPSU, nest = TRUE, weights = ~WTMEC4YR)
# Determine the levels of Depressed
levels(NHANESraw$Depressed)
## [1] "Most" "None" "Several"
# Construct a frequency table of Depressed
# svytable() weights each observation, so the counts estimate population totals.
tab_w <- svytable(~Depressed, design = NHANES_design)
# Determine class of tab_w
class(tab_w)
## [1] "svytable" "xtabs" "table"
# Display tab_w
tab_w
## Depressed
## Most None Several
## 12704441 158758609 32732508
# Add proportions to table
# Convert the svytable to a data frame so each level's share of the estimated
# population total can be computed and plotted.
tab_w <- tab_w %>%
as.data.frame() %>%
mutate(Prop = Freq/sum(Freq))
# Create a barplot
# geom_col() plots the precomputed proportions directly (no re-counting).
ggplot(data = tab_w, mapping = aes(x = Depressed, y = Prop)) +
geom_col()
# Construct and print a frequency table
tab_D <- svytable(~Depressed, design = NHANES_design)
tab_D
## Depressed
## Most None Several
## 12704441 158758609 32732508
# Construct and print a frequency table
tab_H <- svytable(~HealthGen, design = NHANES_design)
tab_H
## HealthGen
## Excellent Fair Good Poor Vgood
## 27659954 31544030 87497585 5668484 77482169
# Construct and print a frequency table
# Two-way weighted table: cross-classifies Depressed by general health rating.
tab_DH <- svytable(~Depressed + HealthGen, design = NHANES_design)
tab_DH
## HealthGen
## Depressed Excellent Fair Good Poor Vgood
## Most 563613.3 3935505.6 4698948.1 1650509.5 1855864.8
## None 21327181.6 17690782.8 59920031.9 2324945.0 57487318.5
## Several 1870620.9 7355104.8 13950468.6 1253819.6 8302494.5
# Add conditional proportions to tab_DH
# Group by HealthGen so each Prop_Depressed is the share of a depression level
# WITHIN a given health rating (columns of tab_DH sum to 1).
tab_DH_cond <- tab_DH %>%
as.data.frame() %>%
group_by(HealthGen) %>%
mutate(n_HealthGen = sum(Freq), Prop_Depressed = Freq/n_HealthGen) %>%
ungroup()
# Print tab_DH_cond
tab_DH_cond
## # A tibble: 15 x 5
## Depressed HealthGen Freq n_HealthGen Prop_Depressed
## <fct> <fct> <dbl> <dbl> <dbl>
## 1 Most Excellent 563613. 23761416. 0.0237
## 2 None Excellent 21327182. 23761416. 0.898
## 3 Several Excellent 1870621. 23761416. 0.0787
## 4 Most Fair 3935506. 28981393. 0.136
## 5 None Fair 17690783. 28981393. 0.610
## 6 Several Fair 7355105. 28981393. 0.254
## 7 Most Good 4698948. 78569449. 0.0598
## 8 None Good 59920032. 78569449. 0.763
## 9 Several Good 13950469. 78569449. 0.178
## 10 Most Poor 1650510. 5229274. 0.316
## 11 None Poor 2324945. 5229274. 0.445
## 12 Several Poor 1253820. 5229274. 0.240
## 13 Most Vgood 1855865. 67645678. 0.0274
## 14 None Vgood 57487318. 67645678. 0.850
## 15 Several Vgood 8302494. 67645678. 0.123
# Create a segmented bar graph of the conditional proportions in tab_DH_cond
# Stacked bars per health rating; coord_flip() makes the long category labels readable.
ggplot(data = tab_DH_cond, mapping = aes(x = HealthGen, y = Prop_Depressed, fill = Depressed)) +
geom_col() +
coord_flip()
# We can also estimate counts with svytotal(). The syntax is given by:
# svytotal(x = ~interaction(Var1, Var2), design = design, na.rm = TRUE)
# For each combination of the two variables, we get an estimate of the total and the standard error
# Estimate the totals for combos of Depressed and HealthGen
tab_totals <- svytotal(x = ~interaction(Depressed, HealthGen), design = NHANES_design, na.rm = TRUE)
# Print table of totals
tab_totals
## total SE
## interaction(Depressed, HealthGen)Most.Excellent 563613 139689
## interaction(Depressed, HealthGen)None.Excellent 21327182 1556268
## interaction(Depressed, HealthGen)Several.Excellent 1870621 277198
## interaction(Depressed, HealthGen)Most.Fair 3935506 370256
## interaction(Depressed, HealthGen)None.Fair 17690783 1206307
## interaction(Depressed, HealthGen)Several.Fair 7355105 455364
## interaction(Depressed, HealthGen)Most.Good 4698948 501105
## interaction(Depressed, HealthGen)None.Good 59920032 3375068
## interaction(Depressed, HealthGen)Several.Good 13950469 931077
## interaction(Depressed, HealthGen)Most.Poor 1650510 195136
## interaction(Depressed, HealthGen)None.Poor 2324945 251934
## interaction(Depressed, HealthGen)Several.Poor 1253820 168440
## interaction(Depressed, HealthGen)Most.Vgood 1855865 269970
## interaction(Depressed, HealthGen)None.Vgood 57487319 2975806
## interaction(Depressed, HealthGen)Several.Vgood 8302495 687020
# Estimate the means for combos of Depressed and HealthGen
# svymean() on an interaction of factors yields estimated population
# proportions (with SEs) for each cell, rather than totals.
tab_means <- svymean(x = ~interaction(Depressed, HealthGen), design = NHANES_design, na.rm = TRUE)
# Print table of means
tab_means
## mean SE
## interaction(Depressed, HealthGen)Most.Excellent 0.0027603 0.0007
## interaction(Depressed, HealthGen)None.Excellent 0.1044492 0.0053
## interaction(Depressed, HealthGen)Several.Excellent 0.0091613 0.0014
## interaction(Depressed, HealthGen)Most.Fair 0.0192740 0.0019
## interaction(Depressed, HealthGen)None.Fair 0.0866400 0.0047
## interaction(Depressed, HealthGen)Several.Fair 0.0360214 0.0026
## interaction(Depressed, HealthGen)Most.Good 0.0230129 0.0023
## interaction(Depressed, HealthGen)None.Good 0.2934563 0.0092
## interaction(Depressed, HealthGen)Several.Good 0.0683220 0.0033
## interaction(Depressed, HealthGen)Most.Poor 0.0080833 0.0010
## interaction(Depressed, HealthGen)None.Poor 0.0113863 0.0013
## interaction(Depressed, HealthGen)Several.Poor 0.0061405 0.0009
## interaction(Depressed, HealthGen)Most.Vgood 0.0090890 0.0013
## interaction(Depressed, HealthGen)None.Vgood 0.2815422 0.0078
## interaction(Depressed, HealthGen)Several.Vgood 0.0406612 0.0028
# Run a chi square test between Depressed and HealthGen
# Survey-weighted chi-square with the Rao-Scott adjustment for the complex design.
svychisq(~Depressed + HealthGen, design = NHANES_design, statistic = "Chisq")
##
## Pearson's X^2: Rao & Scott adjustment
##
## data: svychisq(~Depressed + HealthGen, design = NHANES_design, statistic = "Chisq")
## X-squared = 1592.7, df = 8, p-value < 2.2e-16
# Construct a contingency table
tab <- svytable(~Education + HomeOwn, design=NHANES_design)
# Add conditional proportion of levels of HomeOwn for each educational level
# Same pattern as tab_DH_cond above: proportions within each Education level.
tab_df <- as.data.frame(tab) %>%
group_by(Education) %>%
mutate(n_Education = sum(Freq), Prop_HomeOwn = Freq/n_Education) %>%
ungroup()
# Create a segmented bar graph
ggplot(data = tab_df, mapping = aes(x=Education, y=Prop_HomeOwn, fill=HomeOwn)) +
geom_col() +
coord_flip()
# Run a chi square test
# Tests for association between education level and home ownership,
# accounting for the survey design.
svychisq(~Education + HomeOwn,
design = NHANES_design,
statistic = "Chisq")
##
## Pearson's X^2: Rao & Scott adjustment
##
## data: svychisq(~Education + HomeOwn, design = NHANES_design, statistic = "Chisq")
## X-squared = 531.78, df = 8, p-value = 2.669e-16
Chapter 3 - Exploring quantitative data
Summarizing quantitative data:
Visualizing quantitative data:
ggplot(mapping = aes(x = DaysPhysHlthBad, weight = WTMEC4YR_std)) + geom_density(bw = .6, fill = "lightblue") + labs(x = "Number of Bad Health Days in a Month")
Inference for quantitative data:
Example code includes:
# Compute the survey-weighted mean
svymean(x = ~SleepHrsNight, design = NHANES_design, na.rm = TRUE)
## mean SE
## SleepHrsNight 6.9292 0.0166
# Compute the survey-weighted mean by Gender
# svyby() applies FUN (here svymean) within each level of the `by` variable.
svyby(formula = ~SleepHrsNight, by = ~Gender, design = NHANES_design,
FUN = svymean, na.rm = TRUE, keep.names = FALSE
)
## Gender SleepHrsNight se
## 1 female 6.976103 0.02374684
## 2 male 6.879050 0.01953263
# Compute the survey-weighted quantiles
svyquantile(x = ~SleepHrsNight, design = NHANES_design, na.rm = TRUE,
quantiles = c(0.01, 0.25, 0.5, 0.75, .99)
)
## 0.01 0.25 0.5 0.75 0.99
## SleepHrsNight 4 6 7 8 10
# Compute the survey-weighted quantiles by Gender
# NOTE(review): `keep.rows` is not a documented svyby() argument (it is passed
# through `...`); `keep.var = FALSE` is what suppresses the SE column -- verify.
svyby(formula = ~SleepHrsNight, by = ~Gender, design = NHANES_design, FUN = svyquantile,
na.rm = TRUE, quantiles = c(0.5), keep.rows = FALSE, keep.var = FALSE
)
## Gender statistic
## female female 7
## male male 7
# Compute the survey-weighted mean by Gender
out <- svyby(formula = ~SleepHrsNight, by = ~Gender, design = NHANES_design,
FUN = svymean, na.rm = TRUE, keep.names = FALSE
)
# Construct a bar plot of average sleep by gender
ggplot(data = out, mapping = aes(x=as.factor(Gender), y=SleepHrsNight)) +
geom_col() +
labs(y="Average Nightly Sleep")
# Add lower and upper columns to out
# Approximate 95% CI bounds: estimate +/- 2 standard errors.
out_col <- mutate(out, lower = SleepHrsNight - 2*se, upper = SleepHrsNight + 2*se)
# Construct a bar plot of average sleep by gender with error bars
ggplot(data = out_col, mapping = aes(x = Gender, y = SleepHrsNight, ymin = lower, ymax = upper)) +
geom_col(fill = "gold") +
labs(y = "Average Nightly Sleep") +
geom_errorbar(width = 0.7)
# Create a histogram with a set binwidth
# The weight aesthetic makes bar heights reflect survey-weighted counts
# rather than raw sample counts. Three binwidths are compared below.
ggplot(data = NHANESraw, mapping = aes(x=SleepHrsNight, weight=WTMEC4YR)) +
geom_histogram(binwidth = 1, color = "white") +
labs(x = "Hours of Sleep")
## Warning: Removed 7261 rows containing non-finite values (stat_bin).
# Create a histogram with a set binwidth
ggplot(data = NHANESraw, mapping = aes(x=SleepHrsNight, weight=WTMEC4YR)) +
geom_histogram(binwidth = 0.5, color = "white") +
labs(x = "Hours of Sleep")
## Warning: Removed 7261 rows containing non-finite values (stat_bin).
# Create a histogram with a set binwidth
ggplot(data = NHANESraw, mapping = aes(x=SleepHrsNight, weight=WTMEC4YR)) +
geom_histogram(binwidth = 2, color = "white") +
labs(x = "Hours of Sleep")
## Warning: Removed 7261 rows containing non-finite values (stat_bin).
# Density plot of sleep faceted by gender
# Weights are standardized WITHIN gender (sum to 1 per group) so each
# facet's density integrates to 1.
NHANESraw %>%
filter(!is.na(SleepHrsNight), !is.na(Gender)) %>%
group_by(Gender) %>%
mutate(WTMEC4YR_std = WTMEC4YR/sum(WTMEC4YR)) %>%
ggplot(mapping = aes(x = SleepHrsNight, weight = WTMEC4YR_std)) +
geom_density(bw = 0.6, fill = "gold") +
labs(x = "Hours of Sleep") +
facet_wrap(~Gender, labeller = "label_both")
# Run a survey-weighted t-test
# Design-based t-test of mean sleep hours between genders.
svyttest(formula = SleepHrsNight ~ Gender, design = NHANES_design)
## Warning in summary.glm(g): observations with zero weight not used for
## calculating dispersion
## Warning in summary.glm(glm.object): observations with zero weight not used for
## calculating dispersion
##
## Design-based t-test
##
## data: SleepHrsNight ~ Gender
## t = -3.4077, df = 32, p-value = 0.001785
## alternative hypothesis: true difference in mean is not equal to 0
## 95 percent confidence interval:
## -0.15287218 -0.04123256
## sample estimates:
## difference in mean
## -0.09705237
# Find means of total cholesterol by whether or not active
out <- svyby(formula = ~TotChol, by = ~PhysActive, design = NHANES_design,
FUN = svymean, na.rm = TRUE, keep.names = FALSE
)
# Construct a bar plot of means of total cholesterol by whether or not active
ggplot(data = out, mapping = aes(x=PhysActive, y=TotChol)) +
geom_col()
# Run t test for difference in means of total cholesterol by whether or not active
# Design-based t-test; same pattern as the sleep-by-gender test above.
svyttest(formula = TotChol ~ PhysActive, design = NHANES_design)
##
## Design-based t-test
##
## data: TotChol ~ PhysActive
## t = -3.7936, df = 32, p-value = 0.0006232
## alternative hypothesis: true difference in mean is not equal to 0
## 95 percent confidence interval:
## -0.20053677 -0.06390939
## sample estimates:
## difference in mean
## -0.1322231
Chapter 4 - Modeling quantitative data
Visualization with scatter plots:
Visualizing trends:
Modeling survey data:
More complex modeling:
Wrap up:
Example code includes:
# Create dataset with only 20 year olds
NHANES20 <- filter(NHANESraw, Age == 20)
# Construct scatter plot
# The next several plots compare ways of showing the survey weight on a
# Height/Weight scatter: ignored, mapped to size (bubble), color, or alpha.
ggplot(data = NHANES20, mapping = aes(x=Height, y=Weight)) +
geom_point(alpha = 0.3) +
guides(size = FALSE)
## Warning: Removed 4 rows containing missing values (geom_point).
# Construct bubble plot
ggplot(data = NHANES20, mapping = aes(x=Height, y=Weight, size=WTMEC4YR)) +
geom_point(alpha = 0.3) +
guides(size = FALSE)
## Warning: Removed 4 rows containing missing values (geom_point).
# Construct a scatter plot
ggplot(data = NHANES20, mapping = aes(x=Height, y=Weight, color=WTMEC4YR)) +
geom_point() +
guides(color = FALSE)
## Warning: Removed 4 rows containing missing values (geom_point).
# Construct a scatter plot
ggplot(data = NHANES20, mapping = aes(x=Height, y=Weight, alpha=WTMEC4YR)) +
geom_point() +
guides(alpha = FALSE)
## Warning: Removed 4 rows containing missing values (geom_point).
# Add gender to plot
ggplot(data = NHANES20, mapping = aes(x=Height, y=Weight, size=WTMEC4YR, color=Gender)) +
geom_point(alpha=0.3) +
guides(size = FALSE)
## Warning: Removed 4 rows containing missing values (geom_point).
# Add gender to plot
ggplot(data = NHANES20, mapping = aes(x=Height, y=Weight, alpha=WTMEC4YR, color=Gender)) +
geom_point() +
guides(alpha = FALSE)
## Warning: Removed 4 rows containing missing values (geom_point).
# Bubble plot with linear of best fit
# aes(weight = WTMEC4YR) inside geom_smooth() makes the lm fit
# survey-weighted, matching the weight used for the point sizes.
ggplot(data = NHANESraw, mapping = aes(x = Height, y = Weight, size=WTMEC4YR)) +
geom_point(alpha = 0.1) +
guides(size = FALSE) +
geom_smooth(method = "lm", se = FALSE, mapping = aes(weight=WTMEC4YR))
## Warning: Removed 2279 rows containing non-finite values (stat_smooth).
## Warning: Removed 2279 rows containing missing values (geom_point).
# Add quadratic curve and cubic curve
# poly(x, 2) and poly(x, 3) fit weighted degree-2 and degree-3 polynomial trends.
ggplot(data = NHANESraw, mapping = aes(x = Height, y = Weight, size = WTMEC4YR)) +
geom_point(alpha = 0.1) +
guides(size = FALSE) +
geom_smooth(method = "lm", se = FALSE, mapping = aes(weight = WTMEC4YR)) +
geom_smooth(method = "lm", se = FALSE, mapping = aes(weight = WTMEC4YR), formula = y ~ poly(x, 2), color = "orange") +
geom_smooth(method = "lm", se = FALSE, mapping = aes(weight = WTMEC4YR), formula = y ~ poly(x, 3), color = "red")
## Warning: Removed 2279 rows containing non-finite values (stat_smooth).
## Warning: Removed 2279 rows containing non-finite values (stat_smooth).
## Warning: Removed 2279 rows containing non-finite values (stat_smooth).
## Warning: Removed 2279 rows containing missing values (geom_point).
# Add survey-weighted trend lines to bubble plot
# Dashed lines (linetype = 2) are UNWEIGHTED fits per gender.
ggplot(data = NHANES20, mapping = aes(x = Height, y = Weight, size = WTMEC4YR, color = Gender)) +
geom_point(alpha = 0.1) +
guides(size = FALSE) +
geom_smooth(method = "lm", se = FALSE, linetype = 2)
## Warning: Removed 4 rows containing non-finite values (stat_smooth).
## Warning: Removed 4 rows containing missing values (geom_point).
# Add non-survey-weighted trend lines
# Overlays the weighted fit (solid) on the unweighted fit (dashed) to show
# how much the survey weights change the estimated trend.
ggplot(data = NHANES20, mapping = aes(x = Height, y = Weight, size = WTMEC4YR, color = Gender)) +
geom_point(alpha = 0.1) +
guides(size = FALSE) +
geom_smooth(method = "lm", se = FALSE, linetype = 2) +
geom_smooth(method = "lm", se = FALSE, mapping = aes(weight=WTMEC4YR))
## Warning: Removed 4 rows containing non-finite values (stat_smooth).
## Warning: Removed 4 rows containing non-finite values (stat_smooth).
## Warning: Removed 4 rows containing missing values (geom_point).
# Subset survey design object to only include 20 year olds
# subset() on a design object keeps the full design information (unlike
# filtering the data frame and rebuilding the design).
NHANES20_design <- subset(NHANES_design, Age == 20)
# Build a linear regression model
# svyglm() fits a design-weighted regression of Weight on Height.
mod <- svyglm(Weight ~ Height, design = NHANES20_design)
# Print summary of the model
summary(mod)
##
## Call:
## svyglm(formula = Weight ~ Height, design = NHANES20_design)
##
## Survey design:
## subset(NHANES_design, Age == 20)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -67.2571 22.9836 -2.926 0.00674 **
## Height 0.8305 0.1368 6.072 1.51e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 326.6108)
##
## Number of Fisher Scoring iterations: 2
# Build a linear regression model same slope
# Adding Gender as a main effect shifts the intercept by gender while
# keeping a common Height slope.
mod1 <- svyglm(Weight ~ Height + Gender, design = NHANES20_design)
# Print summary of the same slope model
summary(mod1)
##
## Call:
## svyglm(formula = Weight ~ Height + Gender, design = NHANES20_design)
##
## Survey design:
## subset(NHANES_design, Age == 20)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -53.8665 22.7622 -2.366 0.0254 *
## Height 0.7434 0.1391 5.346 1.2e-05 ***
## Gendermale 2.7207 3.2471 0.838 0.4095
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 325.3881)
##
## Number of Fisher Scoring iterations: 2
# Build a linear regression model different slopes
# Height*Gender expands to Height + Gender + Height:Gender, allowing a
# separate slope and intercept per gender.
mod2 <- svyglm(Weight ~ Height*Gender, design = NHANES20_design)
# Print summary of the different slopes model
summary(mod2)
##
## Call:
## svyglm(formula = Weight ~ Height * Gender, design = NHANES20_design)
##
## Survey design:
## subset(NHANES_design, Age == 20)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.5061 21.5357 0.441 0.66257
## Height 0.3565 0.1269 2.809 0.00932 **
## Gendermale -131.0884 41.9989 -3.121 0.00438 **
## Height:Gendermale 0.7897 0.2385 3.311 0.00273 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 316.5007)
##
## Number of Fisher Scoring iterations: 2
# Plot BPDiaAve and BPSysAve by Diabetes and include trend lines
# drop_na() removes rows with missing Diabetes before plotting so the color
# legend has no NA group.
drop_na(NHANESraw, Diabetes) %>%
ggplot(mapping = aes(x=BPDiaAve, y=BPSysAve, size=WTMEC4YR, color=Diabetes)) +
geom_point(alpha = 0.2) +
guides(size = FALSE) +
geom_smooth(method="lm", se = FALSE, mapping = aes(weight=WTMEC4YR))
## Warning: Removed 4600 rows containing non-finite values (stat_smooth).
## Warning: Removed 4600 rows containing missing values (geom_point).
# Build simple linear regression model
mod1 <- svyglm(BPSysAve ~ BPDiaAve, design = NHANES_design)
# Build model with different slopes
mod2 <- svyglm(BPSysAve ~ BPDiaAve*Diabetes, design = NHANES_design)
# Summarize models
summary(mod1)
##
## Call:
## svyglm(formula = BPSysAve ~ BPDiaAve, design = NHANES_design)
##
## Survey design:
## svydesign(data = NHANESraw, strata = ~SDMVSTRA, id = ~SDMVPSU,
## nest = TRUE, weights = ~WTMEC4YR)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 85.74311 1.86920 45.87 <2e-16 ***
## BPDiaAve 0.48150 0.02354 20.45 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 290.3472)
##
## Number of Fisher Scoring iterations: 2
# Print summary of the different-slopes blood pressure model fitted above
summary(mod2)
##
## Call:
## svyglm(formula = BPSysAve ~ BPDiaAve * Diabetes, design = NHANES_design)
##
## Survey design:
## svydesign(data = NHANESraw, strata = ~SDMVSTRA, id = ~SDMVPSU,
## nest = TRUE, weights = ~WTMEC4YR)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 83.58652 2.05537 40.667 < 2e-16 ***
## BPDiaAve 0.49964 0.02623 19.047 < 2e-16 ***
## DiabetesYes 25.36616 3.56587 7.114 6.53e-08 ***
## BPDiaAve:DiabetesYes -0.22132 0.05120 -4.323 0.000156 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 279.1637)
##
## Number of Fisher Scoring iterations: 2
Chapter 1 - Inference for a Single Parameter
General Social Survey:
CI interpretations:
Approximation shortcut:
Example code includes:
# Load the General Social Survey data (provides the `gss` data frame) and
# inspect its structure.
load("./RInputFiles/gss.RData")
glimpse(gss)
## Observations: 50,346
## Variables: 28
## $ id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,...
## $ year <dbl> 1982, 1982, 1982, 1982, 1982, 1982, 1982, 1982, 1982, 1982...
## $ age <fct> 41, 49, 27, 24, 57, 29, 21, 68, 54, 80, 74, 30, 53, 39, 36...
## $ class <fct> WORKING CLASS, WORKING CLASS, MIDDLE CLASS, MIDDLE CLASS, ...
## $ degree <fct> LT HIGH SCHOOL, HIGH SCHOOL, HIGH SCHOOL, HIGH SCHOOL, LT ...
## $ sex <fct> MALE, FEMALE, FEMALE, FEMALE, MALE, MALE, FEMALE, MALE, FE...
## $ marital <fct> MARRIED, MARRIED, NEVER MARRIED, NEVER MARRIED, NEVER MARR...
## $ race <fct> WHITE, WHITE, WHITE, WHITE, WHITE, WHITE, WHITE, WHITE, WH...
## $ region <fct> NEW ENGLAND, NEW ENGLAND, NEW ENGLAND, NEW ENGLAND, NEW EN...
## $ partyid <fct> "STRONG DEMOCRAT", "STRONG DEMOCRAT", "IND,NEAR DEM", "IND...
## $ happy <fct> PRETTY HAPPY, NOT TOO HAPPY, VERY HAPPY, PRETTY HAPPY, VER...
## $ grass <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ relig <fct> CATHOLIC, CATHOLIC, CATHOLIC, CATHOLIC, CATHOLIC, CATHOLIC...
## $ cappun2 <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ cappun <fct> FAVOR, FAVOR, FAVOR, OPPOSE, OPPOSE, FAVOR, OPPOSE, FAVOR,...
## $ finalter <fct> STAYED SAME, WORSE, BETTER, BETTER, STAYED SAME, BETTER, B...
## $ protest3 <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ natspac <fct> ABOUT RIGHT, TOO MUCH, TOO LITTLE, TOO LITTLE, ABOUT RIGHT...
## $ natarms <fct> TOO LITTLE, TOO LITTLE, ABOUT RIGHT, TOO MUCH, TOO LITTLE,...
## $ conclerg <fct> ONLY SOME, ONLY SOME, A GREAT DEAL, ONLY SOME, A GREAT DEA...
## $ confed <fct> ONLY SOME, ONLY SOME, ONLY SOME, ONLY SOME, A GREAT DEAL, ...
## $ conpress <fct> ONLY SOME, ONLY SOME, A GREAT DEAL, ONLY SOME, A GREAT DEA...
## $ conjudge <fct> HARDLY ANY, ONLY SOME, A GREAT DEAL, A GREAT DEAL, A GREAT...
## $ consci <fct> ONLY SOME, ONLY SOME, A GREAT DEAL, A GREAT DEAL, A GREAT ...
## $ conlegis <fct> ONLY SOME, ONLY SOME, ONLY SOME, ONLY SOME, A GREAT DEAL, ...
## $ zodiac <fct> TAURUS, CAPRICORN, VIRGO, PISCES, CAPRICORN, LEO, LIBRA, C...
## $ oversamp <dbl> 1.235, 1.235, 1.235, 1.235, 1.235, 1.235, 1.235, 1.235, 1....
## $ postlife <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
# Subset data from 2016
gss2016 <- gss %>%
filter(year == 2016)
# Tabulate confidence in the scientific community (consci has implicit NAs).
gss2016 %>% count(consci)
## Warning: Factor `consci` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## # A tibble: 4 x 2
## consci n
## <fct> <int>
## 1 A GREAT DEAL 791
## 2 ONLY SOME 976
## 3 HARDLY ANY 117
## 4 <NA> 983
# Collapse consci to a binary factor: "A GREAT DEAL" becomes "High" and all
# other levels are lumped into "Low". The original factor is kept as
# old_consci for reference.
gss2016 <- gss2016 %>%
mutate(old_consci=consci,
consci=fct_other(fct_recode(old_consci, "High"="A GREAT DEAL"), keep="High", other_level="Low")
)
gss2016 %>% count(consci)
## Warning: Factor `consci` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## # A tibble: 3 x 2
## consci n
## <fct> <int>
## 1 High 791
## 2 Low 1093
## 3 <NA> 983
# Plot distribution of consci
ggplot(gss2016, aes(x = consci)) +
geom_bar()
# Compute proportion of high conf
# mean() of a logical gives the proportion TRUE; NAs are excluded.
# pull() extracts the single value as a bare numeric.
p_hat <- gss2016 %>%
summarize(p = mean(consci == "High", na.rm = TRUE)) %>%
pull()
# Load the infer package
library(infer)
##
## Attaching package: 'infer'
## The following object is masked _by_ '.GlobalEnv':
##
## gss
# Create single bootstrap data set
b1 <- gss2016 %>%
specify(response = consci, success = "High") %>%
generate(reps = 1, type = "bootstrap")
## Warning: Removed 983 rows containing missing values.
# Plot distribution of consci
ggplot(b1, aes(x = consci)) +
geom_bar()
# Compute proportion with high conf
b1 %>%
summarize(p = mean(consci == "High")) %>%
pull()
## [1] 0.4187898
# Create bootstrap distribution for proportion that favor
boot_dist <- gss2016 %>%
specify(response = consci, success = "High") %>%
generate(reps = 500) %>%
calculate(stat = "prop", success = "High", na.rm = TRUE)
## Warning: Removed 983 rows containing missing values.
## Setting `type = "bootstrap"` in `generate()`.
# Plot distribution
ggplot(boot_dist, aes(x=stat)) +
geom_density()
# Compute estimate of SE
SE <- boot_dist %>%
summarize(se = sd(stat)) %>%
pull()
# Create CI
c(p_hat - 2*SE, p_hat + 2*SE)
## [1] 0.3964511 0.4432517
# Two new smaller data sets have been created for you from gss2016: gss2016_small, which contains 50 observations, and gss2016_smaller, which contains just 10 observations.
id50 <- c(6, 98, 2673, 1435, 1535, 525, 2784, 1765, 163, 1859, 2497, 1780, 184, 575, 2781, 2310, 1677, 2478, 1226, 2350, 1139, 1635, 1350, 1809, 1842, 1501, 1502, 2610, 2456, 49, 56, 2167, 2401, 2002, 2343, 2012, 860, 2557, 1147, 1119, 2449, 695, 1511, 666, 1595, 1094, 2643, 769, 1263, 2426)
id10 <- c(1609, 1342, 2066, 2710, 1809, 503, 1889, 486, 1469, 6)
gss2016_small <- gss2016 %>%
filter(id %in% id50)
gss2016_smaller <- gss2016 %>%
filter(id %in% id10)
# Create bootstrap distribution for proportion
boot_dist_small <- gss2016_small %>%
specify(response = consci, success = "High") %>%
generate(reps = 500, type = "bootstrap") %>%
calculate(stat = "prop")
# Compute estimate of SE
SE_small_n <- boot_dist_small %>%
summarize(se = sd(stat)) %>%
pull()
# Create bootstrap distribution for proportion
boot_dist_smaller <- gss2016_smaller %>%
specify(response = consci, success = "High") %>%
generate(reps = 500, type = "bootstrap") %>%
calculate(stat = "prop")
# Compute estimate of SE
SE_smaller_n <- boot_dist_smaller %>%
summarize(se = sd(stat)) %>%
pull()
c(SE_small_n, SE_smaller_n)
## [1] 0.07206823 0.14608464
# Create bootstrap distribution for the proportion with "Low" confidence
# (note: the original comment said "hardly any", but the code uses
# success = "Low", the collapsed ONLY SOME + HARDLY ANY level from above)
boot_dist <- gss2016 %>%
specify(response=consci, success = "Low") %>%
generate(reps=500, type="bootstrap") %>%
calculate(stat = "prop", na.rm = TRUE)
## Warning: Removed 983 rows containing missing values.
# Compute estimate of SE
SE_low_p <- boot_dist %>%
summarize(se = sd(stat)) %>%
pull()
# Compute p-hat and n
p_hat <- gss2016_small %>%
summarize(p = mean(consci == "High", na.rm=TRUE)) %>%
pull()
n <- nrow(gss2016_small)
# Check conditions
p_hat * n >= 10
## [1] TRUE
(1 - p_hat) * n >= 10
## [1] TRUE
# Calculate SE
SE_approx <- sqrt(p_hat * (1 - p_hat) / n)
# Form 95% CI
c(p_hat - 2 * SE_approx, p_hat + 2 * SE_approx)
## [1] 0.242712 0.517288
Chapter 2 - Proportions (Testing and Power)
Hypothesis test for a proportion:
Intervals for differences:
Statistical errors:
Example code includes:
# Construct plot
ggplot(gss2016, aes(x = postlife)) +
geom_bar()
# Compute and save proportion that believe
p_hat <- gss2016 %>%
summarize(mean(postlife == "YES", na.rm = TRUE)) %>%
pull()
# Generate one data set under H0
sim1 <- gss2016 %>%
specify(response = postlife, success = "YES") %>%
hypothesize(null = "point", p = 0.75) %>%
generate(reps = 1, type = "simulate")
## Warning: Removed 279 rows containing missing values.
# Construct plot
ggplot(sim1, aes(x=postlife)) +
geom_bar()
# Compute proportion that believe
sim1 %>%
summarize(mean(postlife == "YES")) %>%
pull()
## [1] 0.7472952
# Generate null distribution
null <- gss2016 %>%
specify(response = postlife, success = "YES") %>%
hypothesize(null = "point", p = .75) %>%
generate(reps = 100, type = "simulate") %>%
calculate(stat = "prop")
## Warning: Removed 279 rows containing missing values.
# Visualize null distribution
ggplot(null, aes(x = stat)) +
geom_density() +
geom_vline(xintercept = p_hat, color = "red")
# Compute the two-tailed p-value
null %>%
summarize(mean(stat > p_hat)) %>%
pull() * 2
## [1] 0
# Plot distribution
ggplot(gss2016, aes(x = sex, fill = cappun)) +
geom_bar(position = "fill")
# Compute two proportions
p_hats <- gss2016 %>%
group_by(sex) %>%
summarize(mean(cappun == "FAVOR", na.rm = TRUE)) %>%
pull()
# Compute difference in proportions
d_hat <- diff(p_hats)
# Create null distribution
null <- gss2016 %>%
specify(cappun ~ sex, success = "FAVOR") %>%
hypothesize(null = "independence") %>%
generate(reps = 500, type = "permute") %>%
calculate(stat = "diff in props", order = c("FEMALE", "MALE"))
## Warning: Removed 172 rows containing missing values.
# Visualize null
ggplot(null, aes(x = stat)) +
geom_density() +
geom_vline(xintercept = d_hat, col = "red")
# Compute two-tailed p-value
null %>%
summarize(mean(stat < d_hat)) %>%
pull() * 2
## [1] 0
# Create the bootstrap distribution
boot <- gss2016 %>%
specify(cappun ~ sex, success="FAVOR") %>%
generate(reps=500, type="bootstrap") %>%
calculate(stat = "diff in props", order = c("FEMALE", "MALE"))
## Warning: Removed 172 rows containing missing values.
# Compute the standard error
SE <- boot %>%
summarize(sd(stat)) %>%
pull()
# Form the CI (lower, upper)
c( d_hat - 2*SE, d_hat + 2*SE )
## [1] -0.12636862 -0.05205316
gssmod <- gss2016 %>%
mutate(coinflip=sample(c("heads", "tails"), size=nrow(.), replace=TRUE))
table(gssmod$coinflip)
##
## heads tails
## 1434 1433
# Find difference in props
p_hats <- gssmod %>%
group_by(coinflip) %>%
summarize(mean(cappun == "FAVOR", na.rm = TRUE)) %>%
pull()
# Compute difference in proportions
d_hat <- diff(p_hats)
# Form null distribution
null <- gssmod %>%
specify(cappun ~ coinflip, success = "FAVOR") %>%
hypothesize(null = "independence") %>%
generate(reps = 500, type = "permute") %>%
calculate(stat = "diff in props", order = c("heads", "tails"))
## Warning: Removed 172 rows containing missing values.
ggplot(null, aes(x = stat)) +
geom_density() +
geom_vline(xintercept = d_hat, color = "red")
# Set alpha
alpha <- 0.05
# Find cutoffs
upper <- null %>%
summarize(quantile(stat, probs = c(1-alpha/2))) %>%
pull()
lower <- null %>%
summarize(quantile(stat, probs = alpha/2)) %>%
pull()
# Visualize cutoffs
ggplot(null, aes(x = stat)) +
geom_density() +
geom_vline(xintercept = d_hat, color = "red") +
geom_vline(xintercept = lower, color = "blue") +
geom_vline(xintercept = upper, color = "blue")
# check if inside cutoffs
d_hat %>%
between(lower, upper)
## [1] TRUE
Chapter 3 - Comparing Many Parameters (Independence)
Contingency tables:
Chi-squared test statistic:
Alternative method - chi-squared test statistic:
Intervals for chi-squared:
Example code includes:
# Exclude "other" party
gss_party <- gss2016 %>%
mutate(party=fct_collapse(partyid,
"D"=c("STRONG DEMOCRAT", "NOT STR DEMOCRAT"),
"R"=c("NOT STR REPUBLICAN", "STRONG REPUBLICAN"),
"I"=c("IND,NEAR DEM", "INDEPENDENT", "IND,NEAR REP"),
"O"="OTHER PARTY"
)
) %>%
filter(!is.na(party), party != "O") %>%
droplevels()
# Bar plot of proportions
gss_party %>%
ggplot(aes(x = party, fill = natspac)) +
geom_bar(position = "fill")
# Bar plot of counts
gss_party %>%
ggplot(aes(x=party, fill = natspac)) +
geom_bar()
# Create table of natspac and party
O <- gss_party %>%
select(natspac, party) %>%
table()
# Convert table back to tidy df
O %>%
broom::tidy() %>%
uncount(n)
## # A tibble: 1,249 x 2
## natspac party
## <chr> <chr>
## 1 TOO LITTLE D
## 2 TOO LITTLE D
## 3 TOO LITTLE D
## 4 TOO LITTLE D
## 5 TOO LITTLE D
## 6 TOO LITTLE D
## 7 TOO LITTLE D
## 8 TOO LITTLE D
## 9 TOO LITTLE D
## 10 TOO LITTLE D
## # ... with 1,239 more rows
# Create one permuted data set
perm_1 <- gss_party %>%
specify(natarms ~ party) %>%
hypothesize(null = "independence") %>%
generate(reps = 1, type = "permute")
## Warning: Removed 1412 rows containing missing values.
# Visualize permuted data
ggplot(perm_1, aes(x = party, fill = natarms)) +
geom_bar()
# Make contingency table
# NOTE(review): tab is built from perm_1 (a single permuted data set),
# so chi_obs_arms below is one draw from the null distribution, NOT the
# observed statistic — yet it is compared against the null as if observed
# later on. Confirm against the course intent.
tab <- perm_1 %>%
ungroup() %>%
select(natarms, party) %>%
table()
# Compute chi-squared stat (outer parens print the assigned value)
(chi_obs_arms <- chisq.test(tab)$statistic)
## X-squared
## 1.34665
# Observed chi-squared for natspac vs party, from the real (unpermuted) data
(chi_obs_spac <- chisq.test(gss_party$natspac, gss_party$party)$statistic)
## X-squared
## 7.568185
# Create null
null <- gss_party %>%
specify(natspac ~ party) %>%
hypothesize(null = "independence") %>%
generate(reps = 100, type = "permute") %>%
calculate(stat = "Chisq")
## Warning: Removed 1514 rows containing missing values.
# Visualize H_0 and obs
ggplot(null, aes(x = stat)) +
geom_density() +
geom_vline(xintercept = chi_obs_spac, color = "red")
# Create null
null <- gss_party %>%
specify(natarms ~ party) %>%
hypothesize(null = "independence") %>%
generate(reps = 100, type = "permute") %>%
calculate(stat = "Chisq")
## Warning: Removed 1412 rows containing missing values.
# Visualize H_0 and obs
ggplot(null, aes(x = stat)) +
geom_density() +
geom_vline(xintercept = chi_obs_arms, color = "red")
# create bar plot
gss2016 %>%
ggplot(aes(x = region, fill = happy)) +
geom_bar(position = "fill") +
coord_flip()
# create table of happy vs region
# NOTE(review): happy keeps whatever levels it has here, while the null
# distribution generated below first collapses happy to VERY HAPPY vs
# Other (fct_other) — confirm the observed statistic is computed on the
# same table structure as the null.
tab <- gss2016 %>%
select(happy, region) %>%
table()
# compute observed statistic (outer parens print the assigned value)
(chi_obs_stat <- chisq.test(tab)$statistic)
## X-squared
## 12.60899
# generate null distribution
null <- gss2016 %>%
mutate(happy=fct_other(happy, keep=c("VERY HAPPY"))) %>%
specify(happy ~ region, success = "VERY HAPPY") %>%
hypothesize(null = "independence") %>%
generate(reps = 500, type = "permute") %>%
calculate(stat = "Chisq")
## Warning: Removed 8 rows containing missing values.
# plot null(s)
ggplot(null, aes(x = stat)) +
geom_density() +
geom_vline(xintercept = chi_obs_stat) +
stat_function(fun = dchisq, args = list(df = (9-1)*(2-1)), color = "blue")
# permutation p-value
null %>%
summarize(mean(stat > chi_obs_stat)) %>%
pull()
## [1] 0.116
# approximation p-value
1 - pchisq(chi_obs_stat, df = (9-1)*(2-1))
## X-squared
## 0.1260301
Chapter 4 - Comparing Many Parameters (Goodness of Fit)
Case Study: Election Fraud:
Goodness of Fit:
Now to the US:
Wrap-Up:
Example code includes:
iran <- readr::read_csv("./RInputFiles/iran.csv")
## Parsed with column specification:
## cols(
## province = col_character(),
## city = col_character(),
## ahmadinejad = col_double(),
## rezai = col_double(),
## karrubi = col_double(),
## mousavi = col_double(),
## total_votes_cast = col_double(),
## voided_votes = col_double(),
## legitimate_votes = col_double()
## )
glimpse(iran)
## Observations: 366
## Variables: 9
## $ province <chr> "East Azerbaijan", "East Azerbaijan", "East Azerba...
## $ city <chr> "Azar Shahr", "Asko", "Ahar", "Bostan Abad", "Bona...
## $ ahmadinejad <dbl> 37203, 32510, 47938, 38610, 36395, 435728, 20520, ...
## $ rezai <dbl> 453, 481, 568, 281, 485, 9830, 166, 55, 442, 391, ...
## $ karrubi <dbl> 138, 468, 173, 53, 190, 3513, 74, 46, 211, 126, 17...
## $ mousavi <dbl> 18312, 18799, 26220, 12603, 33695, 419983, 14340, ...
## $ total_votes_cast <dbl> 56712, 52643, 75500, 51911, 71389, 876919, 35295, ...
## $ voided_votes <dbl> 606, 385, 601, 364, 624, 7865, 195, 102, 634, 661,...
## $ legitimate_votes <dbl> 56106, 52258, 74899, 51547, 70765, 869054, 35100, ...
# Compute candidate totals
totals <- iran %>%
summarize(ahmadinejad = sum(ahmadinejad),
rezai = sum(rezai),
karrubi = sum(karrubi),
mousavi = sum(mousavi))
# Plot totals
totals %>%
gather(key = "candidate", value = "votes") %>%
ggplot(aes(x = candidate, y = votes)) +
geom_bar(stat = "identity")
# Provinces won by #2 (vote totals are grouped by province, not city)
iran %>%
group_by(province) %>%
summarize(ahmadinejad = sum(ahmadinejad),
mousavi = sum(mousavi)) %>%
mutate(mousavi_win = mousavi > ahmadinejad) %>%
filter(mousavi_win)
## # A tibble: 2 x 4
## province ahmadinejad mousavi mousavi_win
## <chr> <dbl> <dbl> <lgl>
## 1 Sistan and Baluchestan 450269 507946 TRUE
## 2 West Azerbaijan 623946 656508 TRUE
# Define get_first: extract the leading digit of each number as a factor,
# for the Benford's-law analysis below.
# Assumes positive whole numbers (vote counts); a leading "-" would
# coerce to NA and a value < 1 would yield digit 0 — TODO confirm inputs.
get_first <- function(x) {
substr(as.character(x), 1, 1) %>%
as.numeric() %>%
as.factor()
}
# Create first_digit
iran2 <- iran %>%
mutate(first_digit = get_first(total_votes_cast))
# Construct barchart
iran2 %>%
ggplot(aes(x=first_digit)) +
geom_bar()
# Tabulate the counts of each digit
tab <- iran2 %>%
select(first_digit) %>%
table()
# Compute observed stat
p_benford <- c(0.301029995663981, 0.176091259055681, 0.1249387366083, 0.0969100130080564, 0.0791812460476248, 0.0669467896306132, 0.0579919469776867, 0.0511525224473813, 0.0457574905606751)
names(p_benford) <- 1:9
p_benford[9] <- 1 - sum(p_benford[-9])
sum(p_benford)
## [1] 1
chi_obs_stat <- chisq.test(tab, p = p_benford)$stat
# Form null distribution
null <- iran2 %>%
specify(response=first_digit) %>%
hypothesize(null = "point", p = p_benford) %>%
generate(reps=500, type = "simulate") %>%
calculate(stat = "Chisq")
# plot both nulls
ggplot(null, aes(x = stat)) +
geom_density() +
geom_vline(xintercept = chi_obs_stat) +
stat_function(fun = dchisq, args = list(df = 9-1), color = "blue")
# permutation p-value
null %>%
summarize(mean(stat > chi_obs_stat)) %>%
pull()
## [1] 0.006
# approximation p-value
pchisq(chi_obs_stat, df=9-1, lower.tail=FALSE)
## X-squared
## 0.006836367
iowa <- readr::read_csv("./RInputFiles/iowa.csv")
## Parsed with column specification:
## cols(
## office = col_character(),
## candidate = col_character(),
## party = col_character(),
## county = col_character(),
## votes = col_double()
## )
glimpse(iowa)
## Observations: 1,386
## Variables: 5
## $ office <chr> "President/Vice President", "President/Vice President", "...
## $ candidate <chr> "Evan McMullin / Nathan Johnson", "Under Votes", "Gary Jo...
## $ party <chr> "Nominated by Petition", NA, "Libertarian", NA, "Socialis...
## $ county <chr> "Adair", "Adair", "Adair", "Adair", "Adair", "Adair", "Ad...
## $ votes <dbl> 10, 32, 127, 5, 0, 10, 1133, 14, 3, 2461, 3848, 38, 5, 10...
# Get R+D county totals
iowa2 <- iowa %>%
filter(candidate == "Hillary Clinton / Tim Kaine" | candidate == "Donald Trump / Mike Pence") %>%
group_by(county) %>%
summarize(dem_rep_votes = sum(votes, na.rm = TRUE))
# Add first_digit
iowa3 <- iowa2 %>%
mutate(first_digit = get_first(dem_rep_votes))
# Construct bar plot
iowa3 %>%
ggplot(aes(x=first_digit)) +
geom_bar()
# Tabulate the counts of each digit
tab <- iowa3 %>%
select(first_digit) %>%
table()
# Compute observed stat
chi_obs_stat <- chisq.test(tab, p = p_benford)$stat
## Warning in chisq.test(tab, p = p_benford): Chi-squared approximation may be
## incorrect
# Form null distribution
null <- iowa3 %>%
specify(response = first_digit) %>%
hypothesize(null = "point", p = p_benford) %>%
generate(reps = 500, type = "simulate") %>%
calculate(stat = "Chisq")
# Visualize null
ggplot(null, aes(x = stat)) +
geom_density() +
geom_vline(xintercept = chi_obs_stat)
Chapter 1 - Dashboard Layouts
Introduction:
Anatomy of flexdashboard:
Layout basics:
Advanced layouts:
Example code includes (not added due to need for separate dashboard file):
Chapter 2 - Data Visualization for Dashboards
Graphs:
Web-Friendly Visualizations:
htmlwidgets:
Example code includes (not added due to need for separate dashboard file):
Chapter 3 - Dashboard Components
Highlighting Single Values:
Dashboard Tables:
Text for Dashboards:
Example code includes (not added due to need for separate dashboard file):
Chapter 4 - Adding Interactivity with Shiny
Incorporating Shiny into Dashboards:
Reactive Dataframe Pattern:
Customized Inputs for Charts:
Wrap-up:
Example code includes (not added due to need for separate dashboard file):
Chapter 1 - Exploring Graphs Through Time
Exploring Data Set:
Exploring Temporal Structure:
Example code includes:
library(igraph)
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:purrr':
##
## compose, simplify
## The following object is masked from 'package:tidyr':
##
## crossing
## The following object is masked from 'package:tibble':
##
## as_data_frame
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
amzn_g <- read.graph("./RInputFiles/amzn_g.gml", format=c("gml"))
amzn_g
## IGRAPH 441da5d DN-- 10245 10754 --
## + attr: id (v/n), name (v/c)
## + edges from 441da5d (vertex names):
## [1] 44 ->42 179 ->71 410 ->730 415 ->741 656 ->1267 669 ->672
## [7] 672 ->669 689 ->690 689 ->1284 690 ->689 690 ->1284 730 ->410
## [13] 741 ->909 786 ->1767 802 ->806 806 ->802 856 ->205 857 ->211
## [19] 867 ->866 868 ->866 909 ->741 911 ->748 921 ->190 1015->151
## [25] 1016->1015 1047->1049 1049->1047 1204->1491 1267->656 1272->669
## [31] 1278->152 1282->943 1284->689 1285->1286 1286->1285 1290->1293
## [37] 1293->1290 1293->1606 1294->1295 1295->1294 1312->730 1350->2783
## [43] 1362->156 1366->190 1438->1580 1438->1581 1467->3996 1479->158
## + ... omitted several edges
# Perform dyad census
dc <- dyad_census(amzn_g)
# Perform triad census
tc <- triad_census(amzn_g)
# Find the edge density
ed <- edge_density(amzn_g)
# Output values
print(dc)
## $mut
## [1] 3199
##
## $asym
## [1] 4356
##
## $null
## [1] 52467335
print(tc)
## [1] 179089386743 44610360 32763436 215 1906
## [6] 507 1198 457 118 0
## [11] 301 170 119 33 239
## [16] 288
print(ed)
## [1] 0.0001024681
# Calculate transitivity
transitivity(amzn_g)
## [1] 0.3875752
# Calculate reciprocity
amzn_rp <- reciprocity(amzn_g)
# Simulate our outputs
nv <- gorder(amzn_g)
ed <- edge_density(amzn_g)
rep_sim <- rep(NA, 1000)
# Simulate
for(i in 1:1000){
rep_sim[i] <- reciprocity(erdos.renyi.game(nv, ed, "gnp", directed = TRUE))
}
# Compare
quantile(rep_sim, c(0.25, .5, 0.975))
## 25% 50% 97.5%
## 0.0000000000 0.0000000000 0.0005504297
print(amzn_rp)
## [1] 0.5949414
# Get the distribution of in and out degrees
table(degree(amzn_g, mode = "in"))
##
## 0 1 2 3 4 5 6 7 8 9 11 12 17
## 2798 5240 1549 424 139 50 20 7 9 5 1 2 1
table(degree(amzn_g, mode = "out"))
##
## 0 1 2 3 4 5
## 1899 6350 1635 313 45 3
# Find important products: high out-degree (> 3) combined with low
# in-degree (< 3) — threshold filters, not a literal out/in ratio
imp_prod <- V(amzn_g)[degree(amzn_g, mode = "out") > 3 & degree(amzn_g, mode = "in") < 3]
# Output the vertices
print(imp_prod)
## + 8/10245 vertices, named, from 441da5d:
## [1] 1629 4545 6334 20181 62482 64344 155513 221085
ipFrom <- c(1629, 1629, 1629, 1629, 1629, 1629, 1629, 1629, 1629, 1629, 1629, 11163, 11163, 11163, 11163, 11163, 11163, 11163, 11163, 11163, 11163, 11163, 32129, 32129, 32129, 32129, 32129, 32129, 32129, 38131, 38131, 38131, 38131, 38131, 38131, 45282, 45282, 45282, 45282, 52831, 52831, 52831, 52831, 52831, 52831, 52831, 52831, 53591, 53591, 53591, 53591, 53591, 53591, 53591, 53591, 56427, 56427, 56427, 56427, 59706, 59706, 59706, 59706, 59706, 59706, 59706, 59706, 62482, 62482, 62482, 62482, 62482, 62482, 67038, 67038, 67038, 67038, 71192, 71192, 71192, 71192, 71192, 77957, 77957, 77957, 77957, 77957, 77957, 103733, 103733, 103733, 103733, 103733, 117841, 117841, 117841, 117841, 117841, 117841, 117841, 117841, 117841, 117841, 123808, 123808, 123808, 123808, 123808, 123808, 123808, 123808, 123808, 132757, 132757, 132757, 132757, 132757, 132757, 132757, 132757, 132757, 132757, 132757, 132757, 132757, 132757, 144749, 144749, 144749, 144749, 144749, 144749, 144749, 170830, 170830, 170830, 170830, 170830, 170830, 177282, 177282, 177282, 177282, 177282, 177282, 177432, 177432, 177432, 177432, 177432, 177432, 177432, 184526, 184526, 184526, 184526, 184526, 191825, 191825, 191825, 191825, 191825, 215668, 215668, 215668, 221085, 221085, 221085, 221085, 221085, 231604, 231604, 231604, 231604, 231604, 231604, 239014, 239014, 239014, 239014, 239014, 242693, 242693, 242693, 242693, 242693, 257621, 257621, 257621, 257621, 261587, 261587, 261587, 261587, 261587, 261587, 261657, 261657, 261657, 261657, 261657, 261657)
ipTo <- c(190, 1366, 2679, 4023, 1625, 1627, 7529, 1272, 1628, 1630, 1631, 11124, 15360, 20175, 10626, 20970, 10776, 11164, 11166, 5955, 8719, 11164, 23842, 23843, 24115, 15312, 23329, 32127, 80473, 44848, 44849, 44850, 38133, 31084, 33711, 10920, 20178, 20179, 87093, 2134, 2136, 4119, 9995, 36524, 64698, 64700, 52833, 120083, 120085, 120086, 36689, 12340, 113789, 32094, 51015, 1898, 10076, 15800, 61488, 63836, 63837, 63838, 8882, 59708, 59711, 26982, 59708, 69497, 69498, 69499, 69500, 23349, 62480, 58926, 58928, 64118, 52271, 71190, 71380, 75384, 9762, 57876, 43543, 43546, 98488, 77951, 77953, 116842, 103732, 103734, 103735, 103728, 124733, 117842, 117843, 117845, 117842, 117843, 117845, 117842, 117842, 117843, 117845, 59267, 89503, 89506, 156, 190, 105428, 184973, 195785, 195787, 132753, 132754, 132755, 52563, 132755, 132756, 132759, 132762, 126757, 132754, 132755, 132756, 189269, 265886, 43155, 80519, 159667, 82479, 152760, 136747, 65216, 114684, 114686, 114687, 117132, 132667, 81755, 109198, 109199, 109202, 144124, 75023, 216449, 139527, 149146, 152038, 177428, 177430, 177428, 177430, 56930, 61658, 207112, 250755, 250756, 56930, 141148, 191036, 147084, 245110, 175959, 177376, 177377, 88463, 103641, 115111, 165118, 228427, 43553, 76706, 78278, 131353, 75725, 119146, 12615, 15740, 229533, 151325, 237568, 239545, 239546, 239547, 110872, 215593, 60310, 60312, 133398, 44502, 261582, 261590, 261599, 271593, 261584, 261588, 261649, 261653, 261654, 261658, 261662, 105814)
ipGroupFrom <- factor(c('DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'Video', 'Video', 'Video', 'Video', 'Video', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'Video', 'Video', 'Video', 'Video', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD'), levels=c("DVD", "Video"))
ipSRFrom <- c(30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 16, 16, 16, 16, 16, 16, 16, 37, 37, 37, 37, 37, 37, 26, 26, 26, 26, 14, 14, 14, 14, 14, 14, 14, 14, 16, 16, 16, 16, 16, 16, 16, 16, 10, 10, 10, 10, 1, 1, 1, 1, 1, 1, 1, 1, 19, 19, 19, 19, 19, 19, 10, 10, 10, 10, 5, 5, 5, 5, 5, 3, 3, 3, 3, 3, 3, 17, 17, 17, 17, 17, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 27, 27, 27, 27, 27, 27, 27, 10, 10, 10, 10, 10, 10, 6, 6, 6, 6, 6, 6, 19, 19, 19, 19, 19, 19, 19, 25, 25, 25, 25, 25, 3, 3, 3, 3, 3, 8, 8, 8, 27, 27, 27, 27, 27, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 26, 26, 26, 26, 26, 15, 15, 15, 15, 8, 8, 8, 8, 8, 8, 26, 26, 26, 26, 26, 26)
ipSRTo <- c(5, 2, 18, 20, 12, 6, 8, 14, 16, 4, 18, 20, 3, 6, 14, 5, 3, 3, 4, 3, 13, 3, 5, 9, 18, 17, 8, 2, 8, 9, 16, 9, 24, 11, 25, 6, 9, 3, 21, 1, 5, 2, 24, 2, 6, 6, 8, 18, 7, 4, 20, 6, 22, 13, 10, 19, 4, 22, 7, 7, 9, 7, 11, 21, 12, 17, 21, 5, 7, 2, 1, 26, 6, 14, 2, 17, 4, 13, 12, 6, 8, 13, 4, 7, 1, 7, 9, 15, 19, 6, 20, 0, 19, 14, 18, 11, 14, 18, 11, 14, 14, 18, 11, 16, 1, 5, 3, 5, 6, 22, 5, 20, 10, 29, 9, 22, 9, 12, 10, 9, 12, 29, 9, 12, 13, 6, 23, 6, 18, 10, 18, 6, 9, 11, 8, 8, 19, 12, 10, 9, 8, 14, 1, 7, 10, 13, 18, 6, 6, 4, 6, 4, 4, 22, 5, 8, 4, 4, 13, 11, 3, 4, 21, 22, 8, 18, 1, 6, 5, 5, 4, 8, 6, 12, 6, 3, 13, 8, 10, 1, 1, 22, 12, 18, 19, 5, 18, 31, 8, 13, 10, 14, 25, 4, 19, 17, 5, 21, 3, 1, 19, 10)
ipTRFrom <- c(290, 290, 290, 290, 290, 290, 290, 290, 290, 290, 290, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 73, 73, 73, 73, 73, 73, 73, 294, 294, 294, 294, 294, 294, 43, 43, 43, 43, 5, 5, 5, 5, 5, 5, 5, 5, 13, 13, 13, 13, 13, 13, 13, 13, 28, 28, 28, 28, 1, 1, 1, 1, 1, 1, 1, 1, 110, 110, 110, 110, 110, 110, 7, 7, 7, 7, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 5, 5, 5, 5, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 25, 25, 25, 25, 25, 25, 25, 2, 2, 2, 2, 2, 2, 12, 12, 12, 12, 12, 12, 111, 111, 111, 111, 111, 111, 111, 294, 294, 294, 294, 294, 0, 0, 0, 0, 0, 0, 0, 0, 243, 243, 243, 243, 243, 43, 43, 43, 43, 43, 43, 15, 15, 15, 15, 15, 483, 483, 483, 483, 483, 1, 1, 1, 1, 12, 12, 12, 12, 12, 12, 2, 2, 2, 2, 2, 2)
ipTRTo <- c(19, 2, 22, 105, 22, 1, 6, 55, 40, 21, 47, 13, 0, 42, 14, 51, 2, 4, 0, 2, 41, 4, 0, 19, 21, 63, 5, 0, 2, 4, 63, 63, 7, 1, 8, 11, 134, 134, 12, 5, 10, 3, 58, 1, 6, 2, 27, 39, 2, 18, 87, 12, 218, 2, 30, 17, 0, 41, 13, 9, 3, 2, 13, 8, 10, 1, 8, 1, 0, 7, 1, 167, 63, 28, 0, 6, 1, 10, 4, 0, 2, 0, 5, 2, 3, 2, 2, 12, 24, 45, 21, 0, 8, 2, 21, 20, 2, 21, 20, 2, 2, 21, 20, 14, 6, 6, 3, 19, 13, 88, 4, 9, 6, 0, 19, 54, 19, 6, 9, 1, 2, 0, 19, 6, 3, 13, 46, 29, 6, 1, 15, 1, 4, 18, 28, 5, 15, 21, 10, 12, 3, 5, 4, 3, 8, 5, 0, 0, 5, 0, 5, 0, 1, 221, 1, 13, 3, 1, 7, 40, 5, 0, 8, 37, 67, 48, 0, 6, 1, 25, 1, 69, 0, 55, 3, 0, 5, 5, 2, 13, 0, 44, 53, 9, 4, 5, 13, 212, 3, 3, 1, 3, 8, 0, 3, 12, 11, 10, 5, 0, 49, 42)
ipTitleFrom <- c(16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 13, 13, 13, 13, 13, 13, 13, 30, 30, 30, 30, 30, 30, 11, 11, 11, 11, 26, 26, 26, 26, 26, 26, 26, 26, 5, 5, 5, 5, 5, 5, 5, 5, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 18, 18, 18, 18, 18, 18, 25, 25, 25, 25, 14, 14, 14, 14, 14, 12, 12, 12, 12, 12, 12, 21, 21, 21, 21, 21, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 27, 27, 27, 27, 27, 27, 27, 27, 27, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 29, 29, 29, 29, 29, 29, 29, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 4, 4, 15, 15, 15, 15, 15, 15, 15, 30, 30, 30, 30, 30, 7, 7, 7, 7, 7, 6, 6, 6, 9, 9, 9, 9, 9, 17, 17, 17, 17, 17, 17, 10, 10, 10, 10, 10, 24, 24, 24, 24, 24, 19, 19, 19, 19, 3, 3, 3, 3, 3, 3, 28, 28, 28, 28, 28, 28)
ipNames <- c('Attraction', 'Barbara The Fair With The Silken Hair', 'Cannibal Apocalypse', "DJ Qbert's Wave Twisters", 'David and Lisa', 'Def Comedy Jam Vol. 13', 'Detroit Lions 2001 NFL Team Video', 'Donnie McClurkin: Live in London and More', 'El Hombre Sin Sombra (Hollow Man)', 'Gladiator', 'Kindergarten Cop', "Kingsley's Meadow - Wise Guy", "Lady & The Tramp II - Scamp's Adventure", 'Lojong - Transforming the Mind (Boxed Set)', 'Menace II Society', 'Merlin', 'Modern Times', 'Murder by Numbers (Full Screen Edition)', 'Nancy Drew: A Haunting We Will Go', 'Princess Nine - Triple Play (Vol. 3)', 'Secret Agent AKA Danger Man Set 2', 'Seguire Tus Pasos', 'Selena Remembered', 'Seven (New Line Platinum Series)', 'Sheba Baby', 'Slaughter', 'The Complete Guide to Medicine Ball Training', 'The Gambler', 'The Getaway', 'The Sum of All Fears')
ip_df <- data.frame(X=1:202,
from=ipFrom,
to=ipTo,
salesrank.from=ipSRFrom,
salesrank.to=ipSRTo,
totalreviews.from=ipTRFrom,
totalreviews.to=ipTRTo,
group.from=ipGroupFrom,
title.from=factor(ipNames[ipTitleFrom], levels=ipNames)
)
# Create a new graph from the from/to edge list
ip_g <- graph_from_data_frame(ip_df %>% select(from, to), directed = TRUE)
# Color edges by sales rank: blue when salesrank.from <= salesrank.to
# (higher- to lower-ranked product), red otherwise
E(ip_g)$rank_flag <- ifelse(ip_df$salesrank.from <= ip_df$salesrank.to, "blue", "red")
# Plot and add a legend
# NOTE(review): the legend's fill uses unique(E(ip_g)$rank_flag), whose
# order depends on which flag occurs first in the data — verify the
# labels actually line up with the colors.
plot(ip_g, vertex.label = NA, edge.arrow.width = 1, edge.arrow.size = 0,
edge.width = 4, margin = 0, vertex.size = 4,
edge.color = E(ip_g)$rank_flag, vertex.color = "black" )
legend("bottomleft", legend = c("Lower to Higher Rank", "Higher to Lower Rank"),
fill = unique(E(ip_g)$rank_flag ), cex = .7)
# Get a count of out degrees for all vertices
# deg_ct <- lapply(time_graph, function(x){return(degree(x, mode = "out") )})
# Create a dataframe starting by adding the degree count
# deg_df <- data.frame(ct = unlist(deg_ct))
# Add a column with the vertex names
# deg_df$vertex_name <- names(unlist(deg_ct))
# Add a time stamp
# deg_df$date <- ymd(rep(d, unlist(lapply(time_graph, function(x){length(V(x))}))))
# See all the vertices that have more than three out degrees
# lapply(time_graph, function(x){return(V(x)[degree(x, mode = "out") > 3])})
# Create a dataframe to plot of three important vertices
# vert_df <- deg_df %>% filter(vertex_name %in% c(1629, 132757, 117841))
# Draw the plot to see how they change through time
# ggplot(vert_df, aes(x = date, y = ct, group = vertex_name, colour = vertex_name)) + geom_path()
# Calculate clustering and reciprocity metrics
# trans <- unlist(lapply(all_graphs, FUN=transitivity))
# rp <- unlist(lapply(all_graphs, FUN=reciprocity))
# Create dataframe for plotting
# met_df <- data.frame("metric" = c(trans, rp))
# Repeat the data
# met_df$date <- rep(ymd(d), 2)
# Sort and then Repeat the metric labels
# met_df$name <- sort(rep(c("clustering", "reciprocity"), 4))
# Plot
# ggplot(met_df, aes(x= date, y= metric, group = name, colour = name)) + geom_path()
Chapter 2 - Talk About R on Twitter
Creating retweet graphs:
rt_name <- find_rt(raw_tweets$tweet_text[i]) if(!is.null(rt_name)){ if(!rt_name %in% all_sn){ rt_g <- rt_g + vertices(rt_name) } rt_g <- rt_g + edges(c(raw_tweets$screen_name[i], rt_name)) } Building mentions graphs:
ment_name <- mention_ext(raw_tweets$tweet_text[i]) if(length(ment_name) > 0 ) { for(j in ment_name) { if(!j %in% all_sn) { ment_g <- ment_g + vertices(j) } ment_g <- ment_g + edges(c(raw_tweets$screen_name[i], j)) } } Finding communities:
coords = layout_with_fr(ment_sg), margin = 0, vertex.size = 6, vertex.color = as.numeric(as.factor(V(eigen_sg)$eigen)) ) Example code includes:
rt_g <- read.graph("./RInputFiles/rt_g.gml", format=c("gml"))
rt_g
## IGRAPH 096a106 DN-- 4118 6052 --
## + attr: id (v/n), name (v/c)
## + edges from 096a106 (vertex names):
## [1] thinkR_fr ->thw_ch thinkR_fr ->omarwagih
## [3] KJMillidine ->Rbloggers earino ->d4tagirl
## [5] ReecheshJC ->KirkDBorne SCMansbridge ->rstudiotips
## [7] DeepSingularity->gp_pulipaka chrisderv ->thinkR_fr
## [9] chrisderv ->_ColinFay chrisderv ->joshua_ulrich
## [11] mtrost2 ->romain_francois mtrost2 ->rstudiotips
## [13] mtrost2 ->RLangTip dani_sola ->rstudiotips
## [15] hrhotz ->rstudiotips hrhotz ->cboettig
## + ... omitted several edges
# Calculate the number of edges (gsize() counts edges; the original
# comments had nodes/edges swapped — the graph header shows 4118 vertices
# and 6052 edges)
gsize(rt_g)
## [1] 6052
# Calculate the number of nodes (gorder() counts vertices)
gorder(rt_g)
## [1] 4118
# Calculate the density
graph.density(rt_g)
## [1] 0.00035697
# Create the plot
plot(rt_g, vertex.label = NA, edge.arrow.width = .8, edge.arrow.size = 0.4, vertex.size = 3)
# Set the default color to black
V(rt_g)$color <- "black"
# Set the color of nodes that were retweeted just once to blue
V(rt_g)[degree(rt_g, mode = "in") == 1]$color <- "blue"
# Set the color of nodes that were retweeters just once to green
V(rt_g)[degree(rt_g, mode = "out") == 1 ]$color <- "green"
# Plot the network
plot(rt_g, vertex.label = NA, edge.arrow.width = .8,
edge.arrow.size = 0.25,
vertex.size = 4, vertex.color = V(rt_g)$color)
# Set the default color to black
V(rt_g)$color <- "black"
# Set the color of nodes that were retweeted just once to blue
V(rt_g)[degree(rt_g, mode = "in") == 1 & degree(rt_g, mode = "out") == 0]$color <- "blue"
# Set the color of nodes that were retweeters just once to green
V(rt_g)[degree(rt_g, mode = "in") == 0 & degree(rt_g, mode = "out") == 1 ]$color <- "green"
# Plot the network
plot(rt_g, vertex.label = NA, edge.arrow.width = .8,
edge.arrow.size = 0.25,
vertex.size = 4, vertex.color = V(rt_g)$color)
# Calculate betweenness
rt_btw <- igraph::betweenness(rt_g, directed = TRUE)
# Plot histogram
hist(rt_btw, breaks = 2000, xlim = c(0, 1000), main = "Betweenness")
# Calculate eigen centrality
rt_ec <- eigen_centrality(rt_g, directed = TRUE)
# Plot histogram
hist(rt_ec$vector, breaks = 100, xlim = c(0, .2), main = "Eigen Centrality")
# Get top 1% of vertices by eigen centrality
top_ec <- rt_ec$vector[rt_ec$vector > quantile(rt_ec$vector, .99)]
# Get top 1% of vertices by betweenness
top_btw <- rt_btw[rt_btw > quantile(rt_btw, .99)]
# Make a nice data frame to print, with three columns, Rank, Betweenness, and Eigencentrality
most_central <- as.data.frame(cbind(1:length(top_ec), names(sort(top_btw, decreasing = T)),
names(sort(top_ec, decreasing = T))
)
)
# Set column names
colnames(most_central) <- c("Rank", "Betweenness", "Eigen Centrality")
# Print out the data frame
print(most_central)
## Rank Betweenness Eigen Centrality
## 1 1 hadleywickham ma_salmon
## 2 2 kierisi rstudiotips
## 3 3 drob opencpu
## 4 4 opencpu AchimZeileis
## 5 5 ma_salmon dataandme
## 6 6 rmflight drob
## 7 7 dataandme _ColinFay
## 8 8 _ColinFay rOpenSci
## 9 9 juliasilge kearneymw
## 10 10 revodavid RobertMylesMc
## 11 11 rOpenSci ptrckprry
## 12 12 nj_tierney rmflight
## 13 13 jonmcalder thosjleeper
## 14 14 Md_Harris revodavid
## 15 15 mauro_lepore juliasilge
## 16 16 sckottie RLadiesGlobal
## 17 17 RLadiesGlobal hadleywickham
## 18 18 kearneymw mauro_lepore
## 19 19 lenkiefer JennyBryan
## 20 20 NumFOCUS tudosgar
## 21 21 tjmahr cboettig
## 22 22 TheRealEveret antuki13
## 23 23 RLadiesMAD jasdumas
## 24 24 jasdumas Rbloggers
## 25 25 JennyBryan rensa_co
## 26 26 hrbrmstr timtrice
## 27 27 antuki13 daattali
## 28 28 Voovarb johnlray
## 29 29 timtrice joranelias
## 30 30 thinkR_fr StatsbyLopez
## 31 31 benmarwick kierisi
## 32 32 RosanaFerrero joshua_ulrich
## 33 33 clquezadar thinkR_fr
## 34 34 drsimonj ledell
## 35 35 zentree pssGuy
## 36 36 thomasp85 bastistician
## 37 37 OilGains zentree
## 38 38 yodacomplex brookLYNevery1
## 39 39 annakrystalli Md_Harris
## 40 40 davidhughjones sckottie
## 41 41 noamross jonmcalder
## 42 42 AlexaLFH nj_tierney
# Transform rt_btw and add as centrality
V(rt_g)$cent <- log(rt_btw+2)
# Visualize
plot(rt_g, vertex.label = NA, edge.arrow.width = .2,
edge.arrow.size = 0.0,
vertex.size = unlist(V(rt_g)$cent), vertex.color = "red")
# Create subgraph
rt_sub <-induced_subgraph(rt_g, V(rt_g)[V(rt_g)$cent >= quantile(V(rt_g)$cent, 0.99 )])
# Plot subgraph
plot(rt_sub, vertex.label = NA, edge.arrow.width = .2,
edge.arrow.size = 0.0,
vertex.size = unlist(V(rt_sub)$cent), vertex.color = "red")
# --- Mention network: load, compare in/out ratios with the retweet graph,
# --- cliques, and community detection ---
ment_g <- read.graph("./RInputFiles/ment_g.gml", format=c("gml"))
ment_g
## IGRAPH 1c93213 DN-- 955 975 --
## + attr: id (v/n), name (v/c)
## + edges from 1c93213 (vertex names):
## [1] thinkR_fr ->ma_salmon thinkR_fr ->rstudio
## [3] thinkR_fr ->rforjournalists thinkR_fr ->aschinchon
## [5] thinkR_fr ->zedsamurai thinkR_fr ->ikashnitsky
## [7] thinkR_fr ->NSSDeviations thinkR_fr ->BeginTry
## [9] chrisderv ->pbaumgartner njogukennly->rstudio
## [11] ma_salmon ->rOpenSci ma_salmon ->RLadiesDC
## [13] ma_salmon ->marvin_dpr ma_salmon ->drob
## [15] ma_salmon ->kearneymw ma_salmon ->LucyStats
## + ... omitted several edges
# In-degree / out-degree ratio per vertex. Vertices with out-degree 0
# produce Inf here, which is why the means below come out as Inf.
rt_ratio <- degree(rt_g, mode="in") / (degree(rt_g, mode="out"))
ment_ratio <- degree(ment_g, mode="in") / (degree(ment_g, mode="out"))
# Create a dataframe to plot with ggplot
ratio_df <- data.frame(io_ratio = c(ment_ratio, rt_ratio))
ratio_df["graph_type"] <- c(rep("Mention", length(ment_ratio)), rep("Retweet", length(rt_ratio)) )
ratio_df_filtered <- ratio_df %>% filter(!is.infinite(io_ratio) & io_ratio > 0)
# Plot the graph
# NOTE(review): ratio_df_filtered is computed above but never used — this
# plot uses the unfiltered ratio_df, which is what triggers the
# "Removed 891 rows containing non-finite values" warning below.
ggplot(ratio_df, aes(x = io_ratio , fill= graph_type, group = graph_type)) +
geom_density(alpha = .5) +
xlim(0, 10)
## Warning: Removed 891 rows containing non-finite values (stat_density).
# Check the mean and median of each ratio
ratio_df %>% group_by(graph_type) %>% summarise(m_ratio = mean(io_ratio))
## # A tibble: 2 x 2
## graph_type m_ratio
## <chr> <dbl>
## 1 Mention Inf
## 2 Retweet Inf
ratio_df %>% group_by(graph_type) %>% summarise(med = median(io_ratio))
## # A tibble: 2 x 2
## graph_type med
## <chr> <dbl>
## 1 Mention Inf
## 2 Retweet 0
# Recompute after dropping the Inf ratios (vertices that never mention/retweet)
ratio_df %>% filter(io_ratio != +Inf) %>% group_by(graph_type) %>% summarise(m_ratio = mean(io_ratio))
## # A tibble: 2 x 2
## graph_type m_ratio
## <chr> <dbl>
## 1 Mention 0.294
## 2 Retweet 0.268
ratio_df %>% filter(io_ratio != +Inf) %>% group_by(graph_type) %>% summarise(med = median(io_ratio))
## # A tibble: 2 x 2
## graph_type med
## <chr> <dbl>
## 1 Mention 0
## 2 Retweet 0
# Plot mention graph
plot(ment_g, vertex.label = NA, edge.arrow.width = .8,
edge.arrow.size = 0.2,
margin = 0,
vertex.size = 3)
# Find the assortivity of each graph
assortativity_degree(rt_g, directed = TRUE)
## [1] -0.1502212
assortativity_degree(ment_g, directed = TRUE)
## [1] -0.07742748
# Find the reciprocity of each graph
reciprocity(rt_g)
## [1] 0.005948447
reciprocity(ment_g)
## [1] 0.01846154
# Get size 3 cliques (triangles); directions are ignored, per the warning
clq_list <- cliques(ment_g, min = 3, max = 3)
## Warning in cliques(ment_g, min = 3, max = 3): At igraph_cliquer.c:56 :Edge
## directions are ignored for clique calculations
# Convert to a dataframe (one row per clique, columns X1..X3 = member names)
# and filter down to just revodavid cliques
clq_df <- data.frame(matrix(names(unlist(clq_list)), nrow = length(clq_list), byrow = T))
rev_d <- clq_df %>% filter(X1 == "revodavid" | X2 == "revodavid" | X3 == "revodavid") %>% droplevels()
# Create empty graph and build it up, adding all three edges of each triangle.
# NOTE(review): 1:dim(rev_d)[1] misbehaves if rev_d has zero rows;
# seq_len(nrow(rev_d)) would be the safe form.
clq_g_empty <- graph.empty()
clq_g <- clq_g_empty + vertices(unique(unlist(rev_d)))
for(i in 1:dim(rev_d)[1]){
clq_g <- clq_g + edges(rev_d[i, 1], rev_d[i, 2])
clq_g <- clq_g + edges(rev_d[i, 2], rev_d[i, 3])
clq_g <- clq_g + edges(rev_d[i, 1], rev_d[i, 3])}
# Trim graph and plot using `simplify()` (drops the duplicate edges created
# when two cliques share a pair of vertices)
clq_g_trimmed <- as.undirected(simplify(clq_g))
plot(clq_g_trimmed)
# Find the communities with three different algorithms
rt_fgc <- cluster_fast_greedy(as.undirected(rt_g))
rt_info <- cluster_infomap(as.undirected(rt_g))
rt_clust <- cluster_louvain(as.undirected(rt_g))
# Compare all the communities (variation-of-information distance; 0 = identical)
compare(rt_fgc, rt_clust, method = 'vi')
## [1] 2.144703
compare(rt_info, rt_clust, method = 'vi')
## [1] 1.623552
compare(rt_fgc, rt_info, method = 'vi')
## [1] 2.324274
# Test membership of the same users
fgc_test <- which(names(membership(rt_fgc)) %in% c("bass_analytics", "big_data_flow"))
membership(rt_fgc)[fgc_test]
## bass_analytics big_data_flow
## 3 3
info_test <- which(names(membership(rt_info)) %in% c("bass_analytics", "big_data_flow"))
membership(rt_info)[info_test]
## bass_analytics big_data_flow
## 102 77
# The crossing() function in igraph will return true if a particular edge crosses communities
# This is useful when we want to see certain vertices that are bridges between communities
# Assign cluster membership to each vertex in rt_g using membership()
V(rt_g)$clust <- membership(rt_clust)
# Assign crossing value to each edge
E(rt_g)$cross <- crossing(rt_clust, rt_g)
# Plot the whole graph (this is probably a mess)
# edge.color = cross+1 maps FALSE->1 (within-community) and TRUE->2 (crossing)
plot(rt_g, vertex.label = NA, edge.arrow.width = .8, edge.arrow.size = 0.2,
coords = layout_with_fr(rt_g), margin = 0, vertex.size = 3,
vertex.color = V(rt_g)$clust, edge.color = E(rt_g)$cross+1)
# Create a subgraph with just a few communities greater than 50 but less than 90 in size
mid_comm <- as.numeric(names(sizes(rt_clust)[sizes(rt_clust) > 50 & sizes(rt_clust) < 90 ]))
rt_sg <- induced.subgraph(rt_g, V(rt_g)[ clust %in% mid_comm ])
# Plot the subgraph
plot(rt_sg, vertex.label = NA, edge.arrow.width = .8, edge.arrow.size = 0.2,
coords = layout_with_fr(rt_sg), margin = 0, vertex.size = 3,
vertex.color = V(rt_sg)$clust, edge.color = E(rt_sg)$cross+1)
Chapter 3 - Bike Sharing in Chicago
Creating our graph from raw data:
Compare Graph Distance vs. Geographic Distance:
st1 <- divy_bike_df %>%
  filter(from_station_id == station_1) %>%
  sample_n(1) %>%
  select(from_longitude, from_latitude)
st2 <- divy_bike_df %>%
  filter(from_station_id == station_2) %>%
  sample_n(1) %>%
  select(from_longitude, from_latitude)
farthest_dist <- distm(st1, st2, fun = distHaversine)
return(farthest_dist)
Connectivity:
Example code includes:
# --- Divvy bike share data: load and build subscriber / customer trip graphs ---
bike_dat <- readr::read_csv("./RInputFiles/divvy_bike_sample.csv")
## Parsed with column specification:
## cols(
## tripduration = col_double(),
## from_station_id = col_double(),
## from_station_name = col_character(),
## to_station_id = col_double(),
## to_station_name = col_character(),
## usertype = col_character(),
## gender = col_character(),
## birthyear = col_double(),
## from_latitude = col_double(),
## from_longitude = col_double(),
## to_latitude = col_double(),
## to_longitude = col_double(),
## geo_distance = col_double()
## )
glimpse(bike_dat)
## Observations: 52,800
## Variables: 13
## $ tripduration <dbl> 295, 533, 1570, 2064, 2257, 296, 412, 948, 826, 3...
## $ from_station_id <dbl> 49, 165, 25, 300, 85, 174, 75, 45, 85, 99, 301, 3...
## $ from_station_name <chr> "Dearborn St & Monroe St", "Clark St & Waveland A...
## $ to_station_id <dbl> 174, 308, 287, 296, 313, 198, 56, 147, 174, 99, 5...
## $ to_station_name <chr> "Canal St & Madison St", "Seeley Ave & Roscoe St"...
## $ usertype <chr> "Subscriber", "Subscriber", "Customer", "Customer...
## $ gender <chr> "Male", "Male", NA, NA, "Male", "Female", "Male",...
## $ birthyear <dbl> 1964, 1972, NA, NA, 1963, 1973, 1989, 1965, 1983,...
## $ from_latitude <dbl> 41.88132, 41.95078, 41.89766, 41.93773, 41.90096,...
## $ from_longitude <dbl> -87.62952, -87.65917, -87.62351, -87.64409, -87.6...
## $ to_latitude <dbl> 41.88209, 41.94340, 41.88032, 41.94011, 41.92586,...
## $ to_longitude <dbl> -87.63983, -87.67962, -87.63519, -87.64545, -87.6...
## $ geo_distance <dbl> 858.9672, 1881.5034, 2159.4804, 287.8546, 3044.07...
# Create trip_df_subs: one row per (from, to) station pair, weight = trip count
trip_df_subs <- bike_dat %>%
filter(usertype == "Subscriber") %>%
group_by(from_station_id, to_station_id) %>%
summarise(weights = n())
# Create igraph object
trip_g_subs <- graph_from_data_frame(trip_df_subs[, 1:2])
# Add edge weights, normalized to sum to 1. This relies on
# graph_from_data_frame() keeping edges in the same row order as trip_df_subs.
E(trip_g_subs)$weights <- trip_df_subs$weights / sum(trip_df_subs$weights)
# Now work the same code and filter it down to non-subs
trip_df_non_subs <- bike_dat %>%
filter(usertype == "Customer") %>%
group_by(from_station_id, to_station_id) %>%
summarise(weights = n())
# Create igraph object
trip_g_non_subs <- graph_from_data_frame(trip_df_non_subs[, 1:2])
# Add edge weights
E(trip_g_non_subs)$weights <- trip_df_non_subs$weights / sum(trip_df_non_subs$weights)
# Now let's compare these graphs (gsize() = edge count)
gsize(trip_g_subs)
## [1] 14679
gsize(trip_g_non_subs)
## [1] 9528
# Create the subgraphs (first 12 vertices of each)
sg_sub <- induced_subgraph(trip_g_subs, 1:12)
sg_non_sub <- induced_subgraph(trip_g_non_subs, 1:12)
# Plot sg_sub
plot(sg_sub, vertex.size = 20, edge.arrow.width = .8, edge.arrow.size = 0.4,
margin = 0, edge.width = E(sg_sub)$weights*10000, main = "Subscribers")
# Plot sg_non_sub
# NOTE(review): this call passes vertex.size twice (20 and 10) — igraph's
# plotting code uses the first value, so the 20 wins; confirm which was meant.
plot(sg_non_sub, vertex.size = 20, edge.arrow.width = .8, edge.arrow.size = 0.4,
margin = 0, vertex.size = 10, edge.width = E(sg_non_sub)$weights*10000,
main = "Customers")
# Approximate geographic distance (metres, great-circle/Haversine) between two
# bike stations. One trip is sampled at random from each station and the
# recorded start coordinates of those trips are compared, so repeated calls can
# give slightly different answers if a station's coordinates vary across rows.
#
# Args:
#   station_1, station_2: station ids matched against from_station_id.
#   divy_bike_df:         trip data with from_station_id / from_longitude /
#                         from_latitude columns.
# Returns: a 1x1 distance matrix from geosphere::distm().
bike_dist <- function(station_1, station_2, divy_bike_df){
  start_coords <- function(station_id) {
    divy_bike_df %>%
      filter(from_station_id == station_id) %>%
      sample_n(1) %>%
      select(from_longitude, from_latitude)
  }
  geosphere::distm(start_coords(station_1), start_coords(station_2),
                   fun = geosphere::distHaversine)
}
# --- Graph vs. geographic distance, degree/strength, min-cuts, clustering ---
# See the diameter of each graph (longest shortest path, as a vertex sequence)
get_diameter(trip_g_subs)
## + 7/300 vertices, named, from 48d7217:
## [1] 200 336 267 150 45 31 298
get_diameter(trip_g_non_subs)
## + 6/299 vertices, named, from 48e9473:
## [1] 116 31 25 137 135 281
# Find the farthest vertices (the endpoints of the diameter path)
farthest_vertices(trip_g_subs)
## $vertices
## + 2/300 vertices, named, from 48d7217:
## [1] 200 298
##
## $distance
## [1] 6
farthest_vertices(trip_g_non_subs)
## $vertices
## + 2/299 vertices, named, from 48e9473:
## [1] 116 281
##
## $distance
## [1] 5
# See how far apart each one is and compare the distances (metres; the
# graph-farthest subscriber stations are also much farther geographically)
bike_dist(200, 298, bike_dat)
## [,1]
## [1,] 17078.31
bike_dist(116, 281, bike_dat)
## [,1]
## [1,] 7465.656
# Create trip_df (all user types combined this time)
trip_df <- bike_dat %>%
group_by(from_station_id, to_station_id) %>%
summarise(weights = n())
# Create igraph object
trip_g_df <- graph_from_data_frame(trip_df[, 1:2])
# Add edge weights
E(trip_g_df)$weights <- trip_df$weights / sum(trip_df$weights)
# With remove.multiple = FALSE, simplify() only drops self-loops here;
# parallel edges (if any) are kept.
trip_g_simp <- simplify(trip_g_df, remove.multiple=FALSE)
trip_g_simp
## IGRAPH 492d629 DN-- 300 18773 --
## + attr: name (v/c), weights (e/n)
## + edges from 492d629 (vertex names):
## [1] 5 ->14 5 ->16 5 ->25 5 ->29 5 ->33 5 ->35 5 ->36 5 ->37 5 ->43
## [10] 5 ->49 5 ->51 5 ->52 5 ->53 5 ->55 5 ->59 5 ->66 5 ->68 5 ->72
## [19] 5 ->74 5 ->75 5 ->76 5 ->81 5 ->85 5 ->90 5 ->92 5 ->97 5 ->98
## [28] 5 ->99 5 ->100 5 ->108 5 ->110 5 ->111 5 ->117 5 ->120 5 ->128 5 ->134
## [37] 5 ->135 5 ->137 5 ->140 5 ->141 5 ->144 5 ->146 5 ->148 5 ->149 5 ->168
## [46] 5 ->169 5 ->171 5 ->174 5 ->175 5 ->176 5 ->177 5 ->178 5 ->181 5 ->191
## [55] 5 ->192 5 ->193 5 ->194 5 ->198 5 ->210 5 ->214 5 ->218 5 ->227 5 ->233
## [64] 5 ->237 5 ->255 5 ->264 5 ->268 5 ->273 5 ->277 5 ->291 5 ->309 5 ->321
## + ... omitted several edges
# Find the degree distribution
trip_out <- degree(trip_g_simp, mode = "out")
trip_in <- degree(trip_g_simp, mode = "in")
# Create a data frame for easier filtering
trip_deg <- data.frame(cbind(trip_out, trip_in))
trip_deg$station_id <- names(trip_out)
# ratio > 1: more distinct destinations than origins for that station
trip_deg_adj <- trip_deg %>% mutate(ratio = trip_out / trip_in)
# Filter out rarely traveled to stations
trip_deg_filter <- trip_deg_adj %>% filter(trip_out > 10) %>% filter(trip_in > 10)
# Plot histogram
hist(trip_deg_filter$ratio)
# See which stations were the most skewed using which.min() and which.max()
trip_deg_filter %>% slice(which.min(ratio))
## trip_out trip_in station_id ratio
## 1 14 24 207 0.5833333
trip_deg_filter %>% slice(which.max(ratio))
## trip_out trip_in station_id ratio
## 1 19 11 135 1.727273
# If the weights are the same across all stations, then an unweighted degree ratio would work
# But if we want to know how many bikes are actually flowing, we need to consider weights
# The weighted analog to degree distribution is strength
# We can calculate this with the strength() function, which presents a weighted degree distribution based on the weight attribute of a graph's edges
# NOTE(review): strength() defaults to E(graph)$weight, but this graph stores
# its weights in an attribute named "weights" — so these calls may actually be
# unweighted (which would explain why the results below match the unweighted
# ones exactly). Verify with strength(..., weights = E(trip_g_simp)$weights).
# Calculate the weighted in and out degrees
trip_out_w <- strength(trip_g_simp, mode = "out")
trip_in_w <- strength(trip_g_simp, mode = "in")
# Create a data frame for easier filtering
trip_deg_w <- data.frame(cbind(trip_out_w, trip_in_w))
trip_deg_w$station_id <- names(trip_out_w)
trip_deg_w_adj <- trip_deg_w %>% mutate(ratio = trip_out_w / trip_in_w)
# Filter out rarely traveled to stations
trip_deg_w_filter <- trip_deg_w_adj %>% filter(trip_out_w > 10) %>% filter(trip_in_w > 10)
# Plot histogram of ratio
hist(trip_deg_w_filter$ratio)
# See which stations were the most skewed using which.min() and which.max()
trip_deg_w_filter %>% slice(which.min(ratio))
## trip_out_w trip_in_w station_id ratio
## 1 14 24 207 0.5833333
trip_deg_w_filter %>% slice(which.max(ratio))
## trip_out_w trip_in_w station_id ratio
## 1 19 11 135 1.727273
# Hard-coded lon/lat for the vertices of the ego graph plotted below; row
# order must match the vertex order of g275 for the layout to be correct.
latlong <- data.frame(from_longitude=c(-87.656495, -87.660996, -87.6554864, -87.642746, -87.67328, -87.661535, -87.623727, -87.668745, -87.65103, -87.666507, -87.666611),
from_latitude=c(41.858166, 41.869417, 41.8694821, 41.880422, 41.87501, 41.857556, 41.864059, 41.857901, 41.871737, 41.865234, 41.891072)
)
# Create a sub graph of the least traveled station 275 (1-step out-neighborhood)
g275 <- make_ego_graph(trip_g_simp, 1, nodes = "275", mode= "out")[[1]]
# Plot graph with geographic coordinates
# NOTE(review): E(g275)$weight is NULL — the edge attribute on this graph is
# "weights" (plural) — so edge.width falls back to the default here. Confirm.
plot(g275, layout = as.matrix(latlong), vertex.label.color = "blue", vertex.label.cex = .6,
edge.color = 'black', vertex.size = 15, edge.arrow.size = .1,
edge.width = E(g275)$weight, main = "Lat/Lon Layout")
# Plot graph without geographic coordinates
plot(g275, vertex.label.color = "blue", vertex.label.cex = .6,
edge.color = 'black', vertex.size = 15, edge.arrow.size = .1,
edge.width = E(g275)$weight,
main = "Default Layout")
# Eigen centrality weighted (weights = NULL uses the graph's weight attribute;
# weights = NA forces an unweighted calculation)
ec_weight <- eigen_centrality(trip_g_simp, directed = T, weights = NULL)
# Eigen centrality unweighted
ec_unweight <- eigen_centrality(trip_g_simp, directed = T, weights = NA)
# Closeness weighted
close_weight <- closeness(trip_g_simp, weights = NULL)
# Closeness unweighted
close_unweight <- closeness(trip_g_simp, weights = NA)
# Output nicely with cbind()
cbind(c(
names(V(trip_g_simp))[which.min(ec_weight$vector)],
names(V(trip_g_simp))[which.min(close_weight)],
names(V(trip_g_simp))[which.min(ec_unweight$vector)],
names(V(trip_g_simp))[which.min(close_unweight)]
), c("Weighted Eigen Centrality", "Weighted Closeness", "Unweighted Eigen Centrality", "Unweighted Closeness")
)
## [,1] [,2]
## [1,] "204" "Weighted Eigen Centrality"
## [2,] "336" "Weighted Closeness"
## [3,] "204" "Unweighted Eigen Centrality"
## [4,] "336" "Unweighted Closeness"
trip_g_ud <- as.undirected(trip_g_simp)
trip_g_ud
## IGRAPH 4965815 UN-- 300 12972 --
## + attr: name (v/c)
## + edges from 4965815 (vertex names):
## [1] 5 --14 14--15 5 --16 13--16 15--16 16--17 5 --19 14--19 15--19 5 --20
## [11] 13--20 16--20 17--20 14--21 16--21 17--21 19--21 14--22 15--22 19--22
## [21] 21--22 5 --23 17--23 20--23 5 --24 16--24 17--24 23--24 5 --25 13--25
## [31] 16--25 17--25 20--25 21--25 22--25 23--25 24--25 5 --26 17--26 19--26
## [41] 20--26 24--26 25--26 13--27 16--27 20--27 24--27 26--27 17--28 20--28
## [51] 5 --29 16--29 17--29 20--29 24--29 25--29 26--29 16--30 17--30 19--30
## [61] 20--30 22--30 29--30 13--31 14--31 16--31 17--31 20--31 21--31 22--31
## [71] 23--31 24--31 25--31 26--31 28--31 30--31 15--32 19--32 21--32 22--32
## + ... omitted several edges
# Find the minimum number of cuts using min_cut()
ud_cut <- min_cut(trip_g_ud, value.only = FALSE)
# Print the vertex with the minimum number of cuts
print(ud_cut$partition1)
## + 1/300 vertex, named, from 4965815:
## [1] 281
# Make an ego graph around the weakly-connected station 281
g<- make_ego_graph(trip_g_ud, 1, nodes = "281")[[1]]
plot(g, edge.color = 'black', edge.arrow.size = .1)
# Print the value (number of edges whose removal disconnects the graph)
print(ud_cut$value)
## [1] 5
# Print cut object
print(ud_cut$cut)
## + 5/12972 edges from 4965815 (vertex names):
## [1] 71 --281 135--281 167--281 203--281 281--305
far_stations <- c("231", "321")
close_stations <- c("231", "213")
# Compare the output of close and far vertices (edge connectivity between pairs)
stMincuts(trip_g_simp, far_stations[1], far_stations[2])$value
## [1] 54
stMincuts(trip_g_simp, close_stations[1], close_stations[2])$value
## [1] 49
# Find the actual value (global clustering coefficient / transitivity)
clust_coef <- transitivity(trip_g_simp, type = "global")
# Get randomization parameters using gorder() and edge_density()
nv <- gorder(trip_g_simp)
ed <- edge_density(trip_g_simp)
# Create an empty vector to hold output of 300 simulations
graph_vec <- rep(NA, 300)
# Calculate clustering for random graphs with the same size and density
for(i in 1:300){
graph_vec[i]<- transitivity(erdos.renyi.game(nv, ed, "gnp", directed = T), type = "global")
}
# Plot a histogram of the simulated values
hist(graph_vec, xlim = c(.35, .6), main = "Unweighted clustering randomization")
# Add a line with the true value
abline(v = clust_coef, col = "red")
# Find the mean local weighted clustering coeffecient
# NOTE(review): type = "weighted" reads E(graph)$weight by default, and this
# graph stores "weights" (plural) — confirm the weighted variant is actually
# being applied here.
m_clust <- mean(transitivity(trip_g_simp, type = "weighted"))
nv <- gorder(trip_g_simp)
ed <- edge_density(trip_g_simp)
graph_vec <- rep(NA, 100)
for(i in 1:100){
g_temp <- erdos.renyi.game(nv, ed, "gnp", directed = T)
# Sample existing weights and add them to the random graph
E(g_temp)$weight <- sample(x = E(trip_g_simp)$weights, size = gsize(g_temp), replace = TRUE)
graph_vec[i]<- mean(transitivity(g_temp, type = "weighted"))
}
# Plot a histogram of the simulated values
# NOTE(review): this title says "Unweighted" but this is the weighted
# randomization — the string looks copy-pasted from the plot above.
hist(graph_vec, xlim = c(.35, .7), main = "Unweighted clustering randomization")
# Add a line with the true value
abline(v = m_clust ,col = "red")
Chapter 4 - Other Ways to Visualize Graph Data
Other packages for plotting graphs:
Interactive visualizations:
Alternative visualizations:
Example code includes:
verts <- c(1185, 3246, 1684, 3634, 3870, 188, 2172, 3669, 2267, 1877, 3931, 1862, 2783, 2351, 423, 3692, 1010, 173, 1345, 3913, 3646, 2839, 2624, 4072, 2685, 2901, 2227, 2431, 1183, 602, 3937, 3688, 2823, 3250, 101, 1951, 3097, 884, 1299, 945, 583, 1691, 1687, 1504, 622, 566, 949, 1897, 1083, 3491, 187, 1799, 3249, 496, 2280, 840, 519, 3060, 4115, 1520, 2700, 385, 1558, 1113, 3303, 1818, 3283, 3291, 3218, 1781, 3055, 2547, 2874, 3, 1923, 890, 1536, 2477, 1422, 449, 984, 2697, 1686, 3181, 415, 1754, 3972, 3600, 3573, 706, 527, 2631, 1383, 2644, 1290, 756, 3147, 377, 4109, 2056, 2411, 1337, 1963, 3833, 1939, 4030, 4111, 2442, 1647, 590, 3749, 1208, 244, 3796, 2886, 570, 2199, 3818, 2342, 1618, 2591, 1279, 1230, 878, 1476, 3930, 616, 364, 567, 2753, 2470, 3554, 2683, 2938, 2077, 2629, 3273, 3131, 3900, 1749, 1240, 1629, 42, 731, 3350, 919, 950, 305, 976, 2906, 3363, 1974, 1539, 978, 441, 1546, 4110, 860, 1762, 864, 1989, 1401, 2572, 1482, 1406, 2110, 2926, 874, 1631, 1050, 2488, 726, 3408, 2946, 2636, 2437, 1468, 2089, 3447, 2292, 3308, 1231, 2788, 1043, 2339, 1893, 3935, 2220, 3589, 3544, 1077, 1263, 4114, 2434, 3679, 1831, 1596, 2585, 598, 2246, 936, 3770, 2355, 2017, 1576, 3445, 1425, 1128, 668, 674, 1884, 989, 845, 2634, 4068, 2736, 1374, 3922, 3202, 3583, 1102, 3746, 2838, 2674, 206, 3966, 1860, 2180, 2717, 3562, 2405, 1666, 2107, 228, 1014, 1543, 768, 3229, 594, 3117, 2121, 2568, 666, 2454, 1209, 2807, 1545, 3753, 3744, 2812, 995, 858, 2293, 1034, 2053, 3034, 650, 1562, 1821, 3351, 3572, 3402, 2600, 3663, 1991, 2222, 1296, 1338, 78, 1936, 3352, 25, 278, 632, 2962, 2826, 3734, 1792, 286, 2491, 2912, 4028, 1522, 863, 223, 1518, 249, 866, 210, 2567, 1140, 386, 276, 3368, 2885, 3122, 3754, 396, 379, 3051, 2996, 36, 2973, 4106, 2404, 1834, 3920, 32, 1724, 1876, 1484, 1769, 2715, 211, 1350, 3054, 3178, 904, 1346, 3256, 3243, 1124, 559, 2672, 394, 128, 3790, 133, 1283, 3468, 3934, 1085, 2794, 3157, 1190, 1864, 2638, 2426, 2435, 3696, 1567, 451, 1987, 850, 1836, 1397, 
3710, 1465, 865, 2350, 515, 3645, 1940, 614, 2341, 3711, 2516, 3914, 1216, 3140, 541, 725, 3369, 1157, 1364, 2943, 3947, 67, 1525, 1812, 1582, 1285, 4117, 1705, 1999, 3608, 2899, 782, 1155, 3632, 2187, 2844, 1393, 2873, 2008, 3412, 692, 1053, 355, 785, 3643, 1105, 2706, 2927, 393, 893, 1007, 4021, 439, 3687, 3667, 510, 3365, 2141, 1469, 1671, 2623, 307, 1259, 2526, 1176, 3083, 798, 1845, 1023, 712, 3520, 1191, 1771, 104, 2025, 2382, 2204, 3784, 3292, 2313, 1119, 1433, 593, 3182, 3516, 2079, 1215, 3673, 3831, 2257, 399, 1793, 366, 3690, 1041, 2147, 2690, 609, 3184, 2603, 2793, 540, 1315, 2471, 1922, 3792, 882, 214, 867, 3261, 3816, 2737, 3990, 457, 3566, 1595, 1697, 605, 2138, 990, 841, 2524, 1033, 2958, 343, 2998, 1559, 2756, 2414, 1620, 2285, 2, 791, 2566, 783, 2961, 1120, 2500, 3390, 421, 464, 2463, 4056, 3029, 3525, 256, 1668, 2544, 316, 3598, 917, 180, 2485, 2848, 1280, 1326, 1039, 290, 1321, 644)
verts <- c(verts, 1937, 1820, 3733, 1232, 1677, 298, 3102, 1427, 2653, 619, 1639, 2774, 226, 2934, 1084, 1312, 1123, 135, 1865, 2440, 3245, 92, 3551, 1088, 3370, 2467, 1604, 2928, 142, 2648, 1250, 2970, 1918, 983, 2866, 328, 2976, 3653, 2692, 4099, 291, 3819, 2864, 1375, 1169, 732, 2031, 3166, 1888, 2092, 2372, 1887, 1816, 58, 170, 3306, 3903, 715, 2312, 2323, 1404, 3824, 1942, 3142, 1964, 3214, 2084, 1502, 3366, 2513, 1464, 66, 2007, 1735, 3109, 2876, 3021, 1301, 3089, 535, 996, 3916, 3451, 2057, 1858, 215, 3417, 424, 312, 3103, 1791, 1189, 3149, 113, 835, 2415, 794, 3636, 612, 2816, 514, 2889, 1162, 1313, 2210, 339, 3850, 3481, 2047, 2739, 3124, 2643, 3428, 155, 3161, 3027, 2711, 1317, 148, 1273, 956, 2969, 1265, 1063, 3899, 3945, 1597, 2543, 363, 767, 3322, 2618, 2850, 1454, 2066, 2778, 3534, 1339, 314, 2174, 2589, 297, 3932, 2132, 2612, 3180, 1649, 1966, 2552, 3581, 3148, 196, 1741, 1213, 2924, 3936, 406, 3631, 813, 259, 3230, 543, 2233, 599, 70, 1797, 3607, 975, 1448, 2022, 2777, 696, 1581, 1542, 2523, 2457, 2857, 3046, 3272, 1891, 3681, 586, 1644, 871, 137, 2176, 1849, 480, 972, 1996, 565, 330, 1466, 1217, 2888, 889, 80, 3487, 1143, 2157, 3594, 3747, 634, 1463, 2150, 1775, 2247, 2484, 1658, 1309, 24, 13, 3383, 367, 1423, 2439, 2522, 3637, 2064, 3639, 4046, 2078, 3676, 3506, 1413, 2964, 2192, 3130, 4078, 1069, 2720, 3344, 1090, 5, 3848, 501, 167, 3915, 3787, 4049, 3986, 233, 2343, 3196, 3918, 4063, 537, 242, 3809, 1648, 1662, 2986, 124, 685, 1726, 4087, 1932, 3999, 1910, 484, 489, 1382, 2289, 2189, 3067, 2722, 2262, 2702, 429, 839, 1109, 1361, 2123, 4058, 3959, 2735, 52, 2183, 2707, 1538, 678, 63, 943, 3047, 3108, 1806, 730, 1628, 2664, 1355, 345, 932, 1201, 861, 3861, 1214, 403, 156, 3429, 3210, 3355, 1583, 2479, 3508, 164, 2299, 3320, 2923, 2562, 460, 4013, 417, 1947, 1853, 2272, 1027, 1997, 3266, 2449, 250, 1486, 177, 1118, 3644, 14, 2538, 3836, 2368, 3349, 1879, 2310, 3413, 4032, 319, 3155, 2413, 3842, 3724, 1802, 3319, 2940, 31, 773, 426, 1067, 2374, 
3240, 2335, 4010, 3398, 3096, 392, 245, 2898, 4026, 138, 2109, 1526, 2011, 881, 512, 372, 1650, 3373, 3659, 552, 2474, 1712, 3786, 2185, 43, 3406, 2890, 3504, 348, 2982, 2186, 481, 4018, 3048, 1360, 962, 838, 720, 1826, 4011, 2161, 1763, 2617, 2447, 65, 1227, 3938, 2569, 3662, 1746, 2742, 4020, 2148, 1643, 2450, 4093, 3905, 230, 3401, 168, 2779, 1847, 1006, 3074, 1894, 1702, 1229, 3704, 2586, 3595, 1163, 3661, 2230, 3236, 1111, 1770, 438, 2504, 2828, 651, 2456, 1900, 3050, 506, 1674, 3477, 2766, 76, 3606, 3630, 1237, 3617, 295, 3512, 1286, 3623, 3495, 964, 3407, 494, 3629, 140, 1178, 3045, 2041, 194, 3852, 3800, 1605, 1420, 1968, 442, 3570, 1796, 1729, 369, 2401, 1507, 2462, 145, 2580, 848, 4043, 3443, 2979, 22, 3727, 1316, 1437, 3450, 3590, 3465, 3188, 2373, 432, 3425, 3449, 1356, 273, 700, 1789, 1251, 1767, 3998, 2005, 1222, 2214, 340)
# --- Alternative graph visualizations: ggnet2, ggnetwork, D3, hive, maps ---
# Create subgraph of rt_g (verts is the hand-picked vertex sample defined above)
rt_samp <- induced_subgraph(rt_g, verts)
# Convert from igraph using asNetwork()
net <- intergraph::asNetwork(rt_samp)
# Plot using igraph
# NOTE(review): edge.size is a ggnet2 argument, not an igraph plot parameter —
# it is silently ignored here; edge.width is the igraph equivalent.
plot(rt_samp, vertex.label = NA, edge.arrow.size = 0.2, edge.size = 0.5,
vertex.color = "black", vertex.size = 1
)
# Plot using ggnet2
GGally::ggnet2(net, node.size = 1, node.color = "black", edge.size = .4)
# Raw plot of rt_samp using ggnetwork()
library(ggnetwork)
library(GGally)
##
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
##
## nasa
ggplot(ggnetwork(rt_samp, arrow.gap = .01) , aes(x = x, y = y, xend = xend, yend = yend)) +
geom_edges(arrow = arrow(length = unit(6, "pt"), type = "closed"), color = "black") +
geom_nodes(size = 4)
## Loading required package: sna
## Loading required package: statnet.common
##
## Attaching package: 'statnet.common'
## The following object is masked from 'package:base':
##
## order
## Loading required package: network
## network: Classes for Relational Data
## Version 1.13.0.1 created on 2015-08-31.
## copyright (c) 2005, Carter T. Butts, University of California-Irvine
## Mark S. Handcock, University of California -- Los Angeles
## David R. Hunter, Penn State University
## Martina Morris, University of Washington
## Skye Bender-deMoll, University of Washington
## For citation information, type citation("network").
## Type help("network-package") to get started.
##
## Attaching package: 'network'
## The following objects are masked from 'package:igraph':
##
## %c%, %s%, add.edges, add.vertices, delete.edges,
## delete.vertices, get.edge.attribute, get.edges,
## get.vertex.attribute, is.bipartite, is.directed,
## list.edge.attributes, list.vertex.attributes,
## set.edge.attribute, set.vertex.attribute
## sna: Tools for Social Network Analysis
## Version 2.4 created on 2016-07-23.
## copyright (c) 2005, Carter T. Butts, University of California-Irvine
## For citation information, type citation("sna").
## Type help(package="sna") to get started.
##
## Attaching package: 'sna'
## The following objects are masked from 'package:igraph':
##
## betweenness, bonpow, closeness, components, degree,
## dyad.census, evcent, hierarchy, is.connected, neighborhood,
## triad.census
# Prettier plot of rt_samp using ggnetwork()
ggplot(ggnetwork(rt_samp, arrow.gap = .01),aes(x = x, y = y, xend = xend, yend = yend)) +
geom_edges(arrow = arrow(length = unit(6, "pt"), type = "closed"), color = "black", curvature = .2) +
geom_nodes(size = 4) + theme_blank()
# NEED TO FIX!
# This section depends on V(rt_g)$clust and V(rt_g)$cent having been set in
# the community-detection section earlier in this document.
rt_keys <- sort(table(vertex_attr(rt_g)$clust), decreasing=TRUE)
# rt_drops <- names(rt_keys)[11:length(rt_keys)]
# vt_drops <- which(vertex_attr(rt_g)$clust %in% rt_drops)
# rt_use <- delete_vertices(rt_g, vt_drops)
# Keep only the 10 largest communities
rt_use <- induced_subgraph(rt_g, which(V(rt_g)$clust %in% names(rt_keys[1:10])))
# Convert to a network object
net <- intergraph::asNetwork(rt_use)
ggnet2(net, node.size = "cent", node.color = "clust", edge.size = .1,
color.legend = "Community Membership", color.palette = "Spectral"
)
# Now remove the centrality legend by setting size to false in the guide() function
ggnet2(net, node.size = "cent", node.color = "clust", edge.size = .1,
color.legend = "Community Membership", color.palette = "Spectral"
) +
guides( size = FALSE)
# Add edge colors
# c("color", "gray88"): within-community edges take the community color,
# crossing edges are gray88.
ggnet2(net, node.size = "cent", node.color = "clust", edge.size = .1,
color.legend = "Community Membership", color.palette = "Spectral",
edge.color = c("color", "gray88")) +
guides( size = FALSE)
# NEED TO CREATE rt_g_smaller!
# Basic plot where we set parameters for the plots using geom_edegs() and geom_nodes()
# ggplot(ggnetwork(rt_g_smaller, arrow.gap = .01), aes(x = x, y = y, xend = xend, yend = yend)) +
# geom_edges(arrow = arrow(length = unit(6, "pt"), type = "closed"), curvature = .2, color = "black") +
# geom_nodes(size = 4, aes(color = comm)) +
# theme_blank()
# Added guide legend, changed line colors, added size
# ggplot(ggnetwork(rt_g_smaller, arrow.gap = .01), aes(x = x, y = y, xend = xend, yend = yend)) +
# geom_edges(arrow = arrow(length = unit(6, "pt"), type = "closed"), curvature = .2, lwd = .3, aes(color=comm)) +
# geom_nodes(aes(color = comm, size = cent)) +
# theme_blank() +
# guides(color = guide_legend(title = "Community"), size = guide_legend(title = "Centrality"))
# NEED TO FIX!
# Add betweenness centrality using betweenness()
V(trip_g_simp)$cent <- igraph::betweenness(trip_g_simp)
# Create a ggplot object with ggnetwork to render using ggiraph
g <- ggplot(ggnetwork(trip_g_simp, arrow.gap = .01), aes(x = x, y = y, xend = xend, yend = yend)) +
geom_edges(color = "black") +
geom_nodes(aes(size = cent)) +
theme_blank()
plot(g)
# Create ggiraph object and assign the tooltip to be interactive
my_gg <- g + ggiraph::geom_point_interactive(aes(tooltip = round(cent, 2),
data_id = round(cent, 2)
), size = 2
)
# Define some hover css so the cursor turns red
hover_css = "cursor:pointer;fill:red;stroke:red;r:3pt"
# ggiraph::ggiraph(code = print(my_gg), hover_css = hover_css, tooltip_offx = 10, tooltip_offy = -10)
# Add community membership as a vertex attribute using the cluster_walktrap algorithm
V(rt_g)$comm <- membership(cluster_walktrap(rt_g))
# Create an induced_subgraph
rt_sub_g <- induced_subgraph(rt_g, which(V(rt_g)$comm %in% 10:13))
# Plot to see what it looks like without an interactive plot using ggnetwork
ggplot(ggnetwork(rt_sub_g, arrow.gap = .01), aes(x = x, y = y, xend = xend, yend = yend)) +
geom_edges(color = "black") +
geom_nodes(aes(color = as.factor(comm))) +
theme_blank()
# Convert to a networkD3 object
# nd3 <- igraph_to_networkD3(rt_sub_g)
# Assign grouping factor as community membership
# nd3$nodes$group = V(rt_sub_g)$comm
# Render your D3.js graph
# forceNetwork(Links = nd3$links, Nodes = nd3$nodes, Source = 'source',
# Target = 'target', NodeID = 'name', Group = 'group', legend = T, fontSize = 20
# )
# Convert trip_df to hive object using edge2HPD()
# bike_hive <- edge2HPD(edge_df = as.data.frame(trip_df))
# Assign to trip_df edgecolor using our custom function
# trip_df$edgecolor <- dist_gradient(trip_df$geodist)
# Calculate centrality with betweenness()
# bike_cent <- betweenness(trip_g)
# Add axis and radius based on longitude and radius
# bike_hive$nodes$radius<- ifelse(bike_cent > 0, bike_cent, runif(1000, 0, 3))
# Set axis as integers and axis colors to black
# bike_hive$nodes$axis <- as.integer(dist_stations$axis)
# bike_hive$axis.cols <- rep("black", 3)
# Set the edge colors to a heatmap based on trip_df$edgecolor
# bike_hive$edges$color <- trip_df$edgecolor
# plotHive(bike_hive, method = "norm", bkgnd = "white")
# Add community membership as a vertex attribute
V(rt_g)$comm <- membership(cluster_walktrap(rt_g))
# Create a subgraph
rt_sub_g <- induced_subgraph(rt_g, which(V(rt_g)$comm %in% 10:15))
# Plot to see what it looks like without an interactive plot
ggplot(ggnetwork(rt_sub_g, arrow.gap = .01), aes(x = x, y = y, xend = xend, yend = yend)) +
geom_edges(color = "black") +
geom_nodes(aes(color = as.factor(comm)))+ theme_blank() +
theme(legend.position = "none")
# Make a Biofabric plot htmlwidget
# rt_bf <- bioFabric(rt_sub_g)
# bioFabric_htmlwidget(rt_bf)
# Create a dataframe of start and end latitude and longitude and add weights
# ll_to_plot <- bike_dat %>% group_by(from_station_id, to_station_id, from_latitude,
# from_longitude, to_latitude, to_longitude, usertype
# ) %>%
# summarise(weight = n())
# Create a base map with station points with ggmap()
# ggmap(chicago) +
# geom_segment(data = ll_to_plot, aes(x = from_longitude, y = from_latitude,
# xend = to_longitude, yend = to_latitude,
# colour = usertype, size = weight
# ), alpha = .5
# )
Chapter 1 - What is Bayesian Analysis?
Introduction:
Bayesian data analysis - named for Thomas Bayes from the early-mid 1700s:
Samples and posterior samples:
Chapter wrap-up:
Example code includes:
# Simulate and visualize a Beta-Binomial model of a proportion of successes.
#
# Args:
#   data:       vector of 0/1 (or TRUE/FALSE) outcomes, in observation order.
#   prior_prop: shape parameters c(alpha, beta) of the Beta prior over the
#               underlying proportion of success; c(1, 1) is a uniform prior.
#   n_draws:    number of posterior draws to return.
#
# Side effect: prints a ridgeline plot showing how the posterior evolves as
# data points are added (top ridge = prior, then n=1, n=2, ...).
#
# Returns (invisibly): n_draws samples from the final Beta posterior.
prop_model <- function(data = c(), prior_prop = c(1, 1), n_draws = 10000) {
  data <- as.logical(data)
  # Evaluation grid on [0, 1]; the duplicated endpoints anchor each ridge at 0.
  proportion_success <- c(0, seq(0, 1, length.out = 100), 1)
  # At most 20 intermediate posteriors are drawn (including the prior, i = 0).
  data_indices <- round(seq(0, length(data), length.out = min(length(data) + 1, 20)))
  post_curves <- map_dfr(data_indices, function(i) {
    value <- ifelse(i == 0, "Prior", ifelse(data[i], "Success", "Failure"))
    label <- paste0("n=", i)
    # Conjugate Beta update after the first i observations.
    probability <- dbeta(proportion_success, prior_prop[1] + sum(data[seq_len(i)]),
      prior_prop[2] + sum(!data[seq_len(i)])
    )
    # Normalize each curve to max 1 so all ridgelines share a common height.
    probability <- probability / max(probability)
    # tibble() replaces the deprecated data_frame() (same behavior, no warning).
    tibble(value, label, proportion_success, probability)
  }
  )
  post_curves$label <- fct_rev(factor(post_curves$label, levels = paste0("n=", data_indices )))
  post_curves$value <- factor(post_curves$value, levels = c("Prior", "Success", "Failure"))
  p <- ggplot(post_curves, aes(x = proportion_success, y = label, height = probability, fill = value)) +
    ggridges::geom_density_ridges(stat="identity", color = "white",
      alpha = 0.8, panel_scaling = TRUE, size = 1
    ) +
    scale_y_discrete("", expand = c(0.01, 0)) +
    scale_x_continuous("Underlying proportion of success") +
    scale_fill_manual(values = hcl(120 * 2:0 + 15, 100, 65), name = "",
      drop = FALSE, labels = c("Prior ", "Success ", "Failure ")
    ) +
    #ggtitle(paste0("Binomial model - Data: ", sum(data), " successes, " , sum(!data), " failures")) +
    theme_light(base_size = 18) +
    theme(legend.position = "top")
  print(p)
  # Return posterior draws invisibly so interactive calls show only the plot.
  invisible(rbeta(n_draws, prior_prop[1] + sum(data), prior_prop[2] + sum(!data)))
}
# Define data and run prop_model
# NOTE(review): prefer `<-` over `=` for top-level assignment (tidyverse style).
data = c(1, 0, 0, 1)
prop_model(data)
## Warning: `data_frame()` is deprecated, use `tibble()`.
## This warning is displayed once per session.
# Define data and run prop_model
data = c(1, 0, 0, 1)
prop_model(data)
# A longer experiment: 2 successes out of 13 trials.
data = c(1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0)
posterior <- prop_model(data)
head(posterior)
## [1] 0.1036182 0.1555942 0.1115889 0.1805051 0.2815857 0.3078648
hist(posterior, breaks = 30, xlim = c(0, 1), col = "palegreen4")
# Get some more information about posterior
median(posterior)
## [1] 0.1882898
# 90% credible interval for the underlying proportion of success.
quantile(posterior, c(0.05, 0.95))
## 5% 95%
## 0.06031634 0.38918378
# Posterior probability that the proportion exceeds 7%.
sum(posterior > 0.07) / length(posterior)
## [1] 0.9263
Chapter 2 - How Does Bayesian Inference Work?
Parts needed for Bayesian inference:
Using a generative model:
Representing uncertainty with priors:
Bayesian models and conditioning:
Chapter wrap-up:
Example code includes:
# Generative zombie drug model
# Parameters
prop_success <- 0.42
n_zombies <- 100
# Simulating data: one Bernoulli(prop_success) draw per zombie.
# NOTE(review): growing `data` inside the loop is an anti-pattern; the
# vectorized equivalent is `data <- runif(n_zombies) < prop_success`.
data <- c()
for(zombie in 1:n_zombies) {
data[zombie] <- runif(1, min = 0, max = 1) < prop_success
}
data <- as.numeric(data)
data
## [1] 1 1 1 1 0 1 0 1 1 1 0 0 0 1 0 0 1 1 0 1 1 0 1 1 1 1 0 0 1 0 1 1 1 1 1 0 1
## [38] 0 0 0 1 0 0 0 1 1 1 1 1 0 1 0 1 1 1 0 0 1 0 1 0 1 1 1 0 1 1 0 0 0 0 0 1 0
## [75] 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0
# Total number of successes (as.numeric is redundant here: data already is).
data_counts <- sum(as.numeric(data))
data_counts
## [1] 46
# Try out rbinom: one draw of "successes out of 100" replaces the loop above.
rbinom(n = 1, size = 100, prob = 0.42)
## [1] 31
# Try out rbinom: 200 replicate experiments of 100 trials each.
rbinom(n = 200, size = 100, prob = 0.42)
## [1] 44 51 40 48 44 37 41 41 50 49 44 42 43 41 41 33 39 40 41 39 46 43 45 41 42
## [26] 38 41 45 45 39 41 48 45 45 43 41 36 39 45 44 37 37 39 51 43 31 42 39 42 42
## [51] 38 46 36 41 48 42 41 37 41 38 36 43 53 37 49 35 43 51 34 44 40 44 39 47 47
## [76] 40 43 38 41 38 40 36 35 37 37 40 41 47 39 39 45 45 47 44 36 49 33 40 44 39
## [101] 40 36 42 38 47 39 48 48 50 42 51 39 44 42 45 38 44 40 40 41 40 46 43 40 46
## [126] 49 40 48 43 43 39 39 41 44 45 43 43 41 48 38 47 42 45 38 44 44 38 40 36 38
## [151] 46 41 42 29 40 42 40 45 33 46 54 47 36 49 41 41 39 40 44 43 52 41 38 38 45
## [176] 49 49 39 47 40 42 37 43 46 42 41 47 48 38 50 42 40 38 36 49 39 46 37 42 36
# Fill in the parameters: fixed click-through rate of 10%.
n_samples <- 100000
n_ads_shown <- 100
proportion_clicks <- 0.1
n_visitors <- rbinom(n_samples, size = n_ads_shown, prob = proportion_clicks)
# Visualize the results
hist(n_visitors)
# Update proportion_clicks: now uncertain, uniform between 0% and 20%.
n_samples <- 100000
n_ads_shown <- 100
proportion_clicks <- runif(n = n_samples, min = 0.0, max = 0.2)
n_visitors <- rbinom(n = n_samples, size = n_ads_shown, prob = proportion_clicks)
# Visualize the results
hist(n_visitors)
hist(proportion_clicks)
# Create prior: joint samples of (parameter, simulated data).
prior <- data.frame(proportion_clicks, n_visitors)
head(prior)
## proportion_clicks n_visitors
## 1 0.102175374 6
## 2 0.066737942 6
## 3 0.007708941 0
## 4 0.123223223 13
## 5 0.070207388 6
## 6 0.112998033 7
# Create posterior: condition on the observed outcome (13 visitors) by
# keeping only matching joint samples (rejection sampling / ABC).
posterior <- prior[prior$n_visitors==13, ]
hist(posterior$proportion_clicks)
# Yesterday's posterior becomes today's prior.
prior <- posterior
head(prior)
## proportion_clicks n_visitors
## 4 0.1232232 13
## 21 0.1409067 13
## 44 0.0885686 13
## 49 0.1316969 13
## 61 0.1619553 13
## 76 0.1349985 13
# Posterior predictive: re-simulate visitor counts from the retained rates.
prior$n_visitors <- rbinom(nrow(prior), size=100, prob=prior$proportion_clicks)
hist(prior$n_visitors)
# Probability of at least 5 visitors in a future campaign.
mean(prior$n_visitors >= 5)
## [1] 0.9867033
Chapter 3 - Why Use Bayesian Data Analysis?
Four good things with Bayes:
Contrasts and comparisons:
Decision analysis:
Change anything and everything:
Bayes is optimal, kind of . . .
Example code includes:
# Draw from the beta distribution: shape1 = shape2 = 1 is uniform on [0, 1].
beta_sample <- rbeta(n = 1000000, shape1 = 1, shape2 = 1)
# Explore the results
hist(beta_sample)
# Draw from the beta distribution: large equal shapes concentrate around 0.5.
beta_sample <- rbeta(n = 10000, shape1 = 100, shape2 = 100)
# Explore the results
hist(beta_sample)
# Draw from the beta distribution: shape1 > shape2 skews mass toward 1.
beta_sample <- rbeta(n = 10000, shape1 = 100, shape2 = 20)
# Explore the results
hist(beta_sample)
n_draws <- 100000
n_ads_shown <- 100
# Update proportion_clicks: informative Beta(5, 95) prior (mean 5%).
proportion_clicks <- rbeta(n_draws, shape1 = 5, shape2 = 95)
n_visitors <- rbinom(n_draws, size = n_ads_shown, prob = proportion_clicks)
prior <- data.frame(proportion_clicks, n_visitors)
posterior <- prior[prior$n_visitors == 13, ]
# Plots the prior and the posterior in the same plot
par(mfcol = c(2, 1))
hist(prior$proportion_clicks,
xlim = c(0, 0.25))
hist(posterior$proportion_clicks,
xlim = c(0, 0.25))
# Reset mfcol below
# Define parameters: uniform prior on the click-through rate, 0%-20%.
n_draws <- 100000
n_ads_shown <- 100
proportion_clicks <- runif(n_draws, min = 0.0, max = 0.2)
n_visitors <- rbinom(n = n_draws, size = n_ads_shown, prob = proportion_clicks)
prior <- data.frame(proportion_clicks, n_visitors)
# Create posteriors by conditioning on each ad's observed visitor count.
posterior_video <- prior[prior$n_visitors == 13, ]
posterior_text <- prior[prior$n_visitors == 6, ]
# Visualize posteriors
hist(posterior_video$proportion_clicks, xlim = c(0, 0.25))
hist(posterior_text$proportion_clicks, xlim = c(0, 0.25))
# Pair up 4000 draws from each posterior to compare the two ads.
# FIX: "proportion_click" only resolved via `$` partial matching (the column
# is proportion_clicks); spell the name out to avoid that footgun.
posterior <- data.frame(video_prop = posterior_video$proportion_clicks[1:4000],
text_prop = posterior_text$proportion_clicks[1:4000]
)
# Create prop_diff: posterior for the difference video - text.
posterior$prop_diff <- posterior$video_prop - posterior$text_prop
# Plot your new column
hist(posterior$prop_diff)
# Explore prop_diff
median(posterior$prop_diff)
## [1] 0.06583102
# Posterior probability that the video ad has the higher click rate.
mean(posterior$prop_diff > 0)
## [1] 0.947
# Decision analysis: expected profit per shown ad = rate * spend - ad cost.
visitor_spend <- 2.53
video_cost <- 0.25
text_cost <- 0.05
posterior$video_profit <- posterior$video_prop * visitor_spend - video_cost
posterior$text_profit <- posterior$text_prop * visitor_spend - text_cost
head(posterior)
## video_prop text_prop prop_diff video_profit text_profit
## 1 0.11438338 0.06966232 0.0447210643 0.03938996 0.12624567
## 2 0.08828099 0.08925227 -0.0009712789 -0.02664909 0.17580825
## 3 0.13337523 0.03383959 0.0995356356 0.08743932 0.03561416
## 4 0.11821430 0.08126194 0.0369523543 0.04908217 0.15559272
## 5 0.10437450 0.07691290 0.0274615984 0.01406748 0.14458963
## 6 0.13520477 0.07912402 0.0560807510 0.09206808 0.15018378
hist(posterior$video_profit)
hist(posterior$text_profit)
# Posterior for the profit difference video - text.
posterior$profit_diff <- posterior$video_profit - posterior$text_profit
head(posterior)
## video_prop text_prop prop_diff video_profit text_profit profit_diff
## 1 0.11438338 0.06966232 0.0447210643 0.03938996 0.12624567 -0.08685571
## 2 0.08828099 0.08925227 -0.0009712789 -0.02664909 0.17580825 -0.20245734
## 3 0.13337523 0.03383959 0.0995356356 0.08743932 0.03561416 0.05182516
## 4 0.11821430 0.08126194 0.0369523543 0.04908217 0.15559272 -0.10651054
## 5 0.10437450 0.07691290 0.0274615984 0.01406748 0.14458963 -0.13052216
## 6 0.13520477 0.07912402 0.0560807510 0.09206808 0.15018378 -0.05811570
hist(posterior$profit_diff)
median(posterior$profit_diff)
## [1] -0.03344751
# Posterior probability that the text ad is the more profitable choice.
mean(posterior$profit_diff < 0)
## [1] 0.6345
# Poisson: counts of events with a given mean rate (lambda).
x <- rpois(n = 10000, lambda = 3)
hist(x)
x <- rpois(n = 10000, lambda = 11.5)
hist(x)
x <- rpois(n = 10000, lambda = 11.5)
# Probability of 15 or more events when the mean rate is 11.5.
mean(x >= 15)
## [1] 0.182
# Bayesian model with a Poisson likelihood and a uniform prior on the rate.
n_draws <- 100000
n_ads_shown <- 100
mean_clicks <- runif(n_draws, min = 0, max = 80)
n_visitors <- rpois(n_draws, lambda=mean_clicks)
prior <- data.frame(mean_clicks, n_visitors)
# Condition on the observed count of 19 visitors.
posterior <- prior[prior$n_visitors == 19, ]
hist(prior$mean_clicks)
hist(posterior$mean_clicks)
# Reset to default
par(mfcol = c(1, 1))
Chapter 4 - Bayesian Inference with Bayes’ Theorem
Probability rules:
Calculating likelihoods:
Bayesian calculation:
Bayes theorem:
Example code includes:
# Probability rules: single draw, then four dependent draws multiplied.
prob_to_draw_ace <- 4 / 52
prob_to_draw_four_aces <- (4 / 52) * (3 / 51) * (2 / 50) * (1 / 49)
# Monte Carlo estimate of P(13 visitors | 10% click rate) ...
n_ads_shown <- 100
proportion_clicks <- 0.1
n_visitors <- rbinom(n = 99999,
size = n_ads_shown, prob = proportion_clicks)
prob_13_visitors <- sum(n_visitors == 13) / length(n_visitors)
prob_13_visitors
## [1] 0.07536075
# ... versus the exact value from the binomial density.
prob_13_visitors <- dbinom(x=13, size=n_ads_shown, prob=proportion_clicks)
prob_13_visitors
## [1] 0.07430209
# Whole probability distribution over every possible visitor count (0-100)
# for a fixed 10% click rate.
n_ads_shown <- 100
proportion_clicks <- 0.1
n_visitors <- 0:n_ads_shown
prob <- dbinom(n_visitors,
size = n_ads_shown, prob = proportion_clicks)
prob
## [1] 2.656140e-05 2.951267e-04 1.623197e-03 5.891602e-03 1.587460e-02
## [6] 3.386580e-02 5.957873e-02 8.889525e-02 1.148230e-01 1.304163e-01
## [11] 1.318653e-01 1.198776e-01 9.878801e-02 7.430209e-02 5.130383e-02
## [16] 3.268244e-02 1.929172e-02 1.059153e-02 5.426525e-03 2.602193e-03
## [21] 1.170987e-03 4.956559e-04 1.977617e-04 7.451890e-05 2.656461e-05
## [26] 8.972934e-06 2.875940e-06 8.758007e-07 2.537042e-07 6.998736e-08
## [31] 1.840408e-08 4.617512e-09 1.106279e-09 2.532895e-10 5.545880e-11
## [36] 1.161994e-11 2.331161e-12 4.480309e-13 8.253201e-14 1.457830e-14
## [41] 2.470212e-15 4.016606e-16 6.269305e-17 9.395858e-18 1.352434e-18
## [46] 1.870032e-19 2.484342e-20 3.171501e-21 3.890962e-22 4.587982e-23
## [51] 5.199713e-24 5.664175e-25 5.930440e-26 5.967739e-27 5.771270e-28
## [56] 5.363200e-29 4.788572e-30 4.107157e-31 3.383290e-32 2.676049e-33
## [61] 2.031815e-34 1.480375e-35 1.034671e-36 6.934301e-38 4.454325e-39
## [66] 2.741123e-40 1.615140e-41 9.106926e-43 4.910597e-44 2.530420e-45
## [71] 1.245128e-46 5.845669e-48 2.616117e-49 1.114936e-50 4.520010e-52
## [76] 1.741041e-53 6.363454e-55 2.203794e-56 7.220406e-58 2.234162e-59
## [81] 6.516307e-61 1.787738e-62 4.602579e-64 1.109055e-65 2.493907e-67
## [86] 5.216014e-69 1.010856e-70 1.807405e-72 2.966699e-74 4.444493e-76
## [91] 6.035732e-78 7.369636e-80 8.010474e-82 7.656367e-84 6.335055e-86
## [96] 4.445653e-88 2.572716e-90 1.178793e-92 4.009500e-95 9.000000e-98
## [101] 1.000000e-100
plot(x=n_visitors, y=prob, type="h")
# Now fix the data (13 visitors) and vary the parameter instead:
# the likelihood of each candidate click rate.
n_ads_shown <- 100
proportion_clicks <- seq(0, 1, by = 0.01)
n_visitors <- 13
prob <- dbinom(n_visitors,
size = n_ads_shown, prob = proportion_clicks)
prob
## [1] 0.000000e+00 2.965956e-11 1.004526e-07 8.009768e-06 1.368611e-04
## [6] 1.001075e-03 4.265719e-03 1.247940e-02 2.764481e-02 4.939199e-02
## [11] 7.430209e-02 9.703719e-02 1.125256e-01 1.178532e-01 1.129620e-01
## [16] 1.001234e-01 8.274855e-02 6.419966e-02 4.701652e-02 3.265098e-02
## [21] 2.158348e-02 1.362418e-02 8.234325e-03 4.775927e-03 2.663369e-03
## [26] 1.430384e-03 7.408254e-04 3.704422e-04 1.790129e-04 8.366678e-05
## [31] 3.784500e-05 1.657584e-05 7.032793e-06 2.891291e-06 1.151996e-06
## [36] 4.448866e-07 1.665302e-07 6.041614e-08 2.124059e-08 7.234996e-09
## [41] 2.386939e-09 7.624614e-10 2.357105e-10 7.048636e-11 2.037726e-11
## [46] 5.691404e-12 1.534658e-12 3.991862e-13 1.000759e-13 2.415778e-14
## [51] 5.609229e-15 1.251336e-15 2.678760e-16 5.495443e-17 1.078830e-17
## [56] 2.023515e-18 3.620178e-19 6.166397e-20 9.980560e-21 1.531703e-21
## [61] 2.223762e-22 3.046572e-23 3.927965e-24 4.752038e-25 5.377247e-26
## [66] 5.671478e-27 5.554432e-28 5.030231e-29 4.193404e-30 3.201904e-31
## [71] 2.227032e-32 1.402449e-33 7.942805e-35 4.015572e-36 1.797200e-37
## [76] 7.054722e-39 2.403574e-40 7.024314e-42 1.737424e-43 3.582066e-45
## [81] 6.048981e-47 8.199196e-49 8.713462e-51 7.062754e-53 4.226413e-55
## [86] 1.795925e-57 5.170371e-60 9.521923e-63 1.044590e-65 6.239308e-69
## [91] 1.807405e-72 2.180415e-76 8.911963e-81 9.240821e-86 1.591196e-91
## [96] 2.358848e-98 1.001493e-106 1.546979e-117 8.461578e-133 6.239651e-159
## [101] 0.000000e+00
plot(x=proportion_clicks, y=prob, type="h")
# Grid approximation of Bayes' theorem over every (parameter, data) pair.
n_ads_shown <- 100
proportion_clicks <- seq(0, 1, by = 0.01)
n_visitors <- seq(0, 100, by = 1)
pars <- expand.grid(proportion_clicks = proportion_clicks,
n_visitors = n_visitors)
pars$prior <- dunif(pars$proportion_clicks, min = 0, max = 0.2)
pars$likelihood <- dbinom(pars$n_visitors,
size = n_ads_shown, prob = pars$proportion_clicks)
# Joint probability = likelihood * prior, normalized to sum to 1.
pars$probability <- pars$likelihood * pars$prior
pars$probability <- pars$probability / sum(pars$probability)
# Condition on the observed data (6 visitors) and renormalize: the posterior.
pars_conditioned <- pars[pars$n_visitors==6, ]
pars_conditioned$probability <- pars_conditioned$probability / sum(pars_conditioned$probability)
plot(x=pars_conditioned$proportion_clicks, y=pars_conditioned$probability, type="h")
# Simplify slightly for a known result of 6
# (grid over the parameter only, since the data value is fixed).
n_ads_shown <- 100
proportion_clicks <- seq(0, 1, by = 0.01)
n_visitors <- 6
pars <- expand.grid(proportion_clicks = proportion_clicks,
n_visitors = n_visitors)
pars$prior <- dunif(pars$proportion_clicks, min = 0, max = 0.2)
pars$likelihood <- dbinom(pars$n_visitors,
size = n_ads_shown, prob = pars$proportion_clicks)
# Posterior ∝ likelihood * prior, normalized over the grid.
pars$probability <- pars$likelihood * pars$prior
pars$probability <- pars$probability / sum(pars$probability)
plot(pars$proportion_clicks, pars$probability, type = "h")
Chapter 5 - More Parameters, Data, and Bayes
Temperature in a normal lake:
Bayesian model of water temperature:
likelihoods <- dnorm(temp, pars$mu[i], pars$sigma[i])
pars$likelihood[i] <- prod(likelihoods)
Beach party implications of water temperatures:
Practical tool (BEST):
Wrap-up and next steps:
Example code includes:
# Random draws from a normal distribution (e.g. birth weights in grams) ...
mu <- 3500
sigma <- 600
weight_distr <- rnorm(n = 100000, mean = mu, sd = sigma)
hist(weight_distr, xlim = c(0, 6000), col = "lightgreen")
# ... versus the exact density evaluated on a grid of weights.
mu <- 3500
sigma <- 600
weight <- seq(0, 6000, by=100)
likelihood <- dnorm(weight, mean=mu, sd=sigma)
plot(x=weight, y=likelihood, type="h")
# The IQ of a bunch of zombies
iq <- c(55, 44, 34, 18, 51, 40, 40, 49, 48, 46)
# Defining the parameter grid (100 x 100 combinations of mu and sigma)
pars <- expand.grid(mu = seq(0, 150, length.out = 100),
sigma = seq(0.1, 50, length.out = 100))
# Defining and calculating the prior density for each parameter combination
pars$mu_prior <- dnorm(pars$mu, mean = 100, sd = 100)
pars$sigma_prior <- dunif(pars$sigma, min = 0.1, max = 50)
pars$prior <- pars$mu_prior * pars$sigma_prior
# Calculating the likelihood for each parameter combination:
# the product of per-observation normal densities.
# NOTE(review): seq_len(nrow(pars)) is the safe idiom instead of 1:nrow(pars).
for(i in 1:nrow(pars)) {
likelihoods <- dnorm(iq, pars$mu[i], pars$sigma[i])
pars$likelihood[i] <- prod(likelihoods)
}
# Calculating the probability of each parameter combination (normalized)
pars$probability <- pars$likelihood * pars$prior / sum(pars$likelihood * pars$prior)
lattice::levelplot(probability ~ mu * sigma, data = pars)
head(pars)
## mu sigma mu_prior sigma_prior prior likelihood probability
## 1 0.000000 0.1 0.002419707 0.02004008 4.849113e-05 0 0
## 2 1.515152 0.1 0.002456367 0.02004008 4.922578e-05 0 0
## 3 3.030303 0.1 0.002493009 0.02004008 4.996010e-05 0 0
## 4 4.545455 0.1 0.002529617 0.02004008 5.069373e-05 0 0
## 5 6.060606 0.1 0.002566174 0.02004008 5.142633e-05 0 0
## 6 7.575758 0.1 0.002602661 0.02004008 5.215754e-05 0 0
# Sample grid rows with probability proportional to their posterior mass,
# turning the grid into a sample-based posterior representation.
sample_indices <- sample( nrow(pars), size = 10000,
replace = TRUE, prob = pars$probability)
head(sample_indices)
## [1] 2827 2728 3025 3126 4035 4727
pars_sample <- pars[sample_indices, c("mu", "sigma")]
hist(pars_sample$mu)
# 95% credible interval and median for the mean zombie IQ.
quantile(pars_sample$mu, c(0.025, 0.5, 0.975))
## 2.5% 50% 97.5%
## 34.84848 42.42424 50.00000
head(pars_sample)
## mu sigma
## 2827 39.39394 14.21313
## 2728 40.90909 13.70909
## 3025 36.36364 15.22121
## 3126 37.87879 15.72525
## 4035 51.51515 20.26162
## 4727 39.39394 23.78990
# Posterior predictive: one simulated IQ per posterior (mu, sigma) draw.
pred_iq <- rnorm(10000, mean = pars_sample$mu, sd = pars_sample$sigma)
hist(pred_iq)
# Predictive probability that the next zombie has IQ >= 60.
mean(pred_iq >= 60)
## [1] 0.0886
# The IQ of zombies on a regular diet and a brain based diet.
iq_brains <- c(44, 52, 42, 66, 53, 42, 55, 57, 56, 51)
iq_regular <- c(55, 44, 34, 18, 51, 40, 40, 49, 48, 46)
# Raw difference in sample means (no uncertainty attached).
mean(iq_brains) - mean(iq_regular)
## [1] 9.3
# Need to load http://www.sourceforge.net/projects/mcmc-jags/files for rjags (called by BEST)
# BESTmcmc would give the full posterior for the group difference instead.
# library(BEST)
# best_posterior <- BESTmcmc(iq_brains, iq_regular)
# plot(best_posterior)
Chapter 1 - Introduction to Factor Variables
Introduction to qualitative variables:
Understanding your qualitative variables:
Making better plots:
Example code includes:
# Load the multiple-choice survey responses; per the parse spec below,
# every column is read as character except Age (double).
multiple_choice_answers <- readr::read_csv("./RInputFiles/smc_with_js.csv")
## Parsed with column specification:
## cols(
## .default = col_character(),
## Age = col_double()
## )
## See spec(...) for full column specifications.
# Print out the dataset
glimpse(multiple_choice_answers)
## Observations: 16,716
## Variables: 47
## $ LearningPlatformUsefulnessArxiv <chr> NA, NA, "Very useful", ...
## $ LearningPlatformUsefulnessBlogs <chr> NA, NA, NA, "Very usefu...
## $ LearningPlatformUsefulnessCollege <chr> NA, NA, "Somewhat usefu...
## $ LearningPlatformUsefulnessCompany <chr> NA, NA, NA, NA, NA, NA,...
## $ LearningPlatformUsefulnessConferences <chr> "Very useful", NA, NA, ...
## $ LearningPlatformUsefulnessFriends <chr> NA, NA, NA, "Very usefu...
## $ LearningPlatformUsefulnessKaggle <chr> NA, "Somewhat useful", ...
## $ LearningPlatformUsefulnessNewsletters <chr> NA, NA, NA, NA, NA, NA,...
## $ LearningPlatformUsefulnessCommunities <chr> NA, NA, NA, NA, NA, NA,...
## $ LearningPlatformUsefulnessDocumentation <chr> NA, NA, NA, "Very usefu...
## $ LearningPlatformUsefulnessCourses <chr> NA, NA, "Very useful", ...
## $ LearningPlatformUsefulnessProjects <chr> NA, NA, NA, "Very usefu...
## $ LearningPlatformUsefulnessPodcasts <chr> "Very useful", NA, NA, ...
## $ LearningPlatformUsefulnessSO <chr> NA, NA, NA, NA, NA, "Ve...
## $ LearningPlatformUsefulnessTextbook <chr> NA, NA, NA, NA, "Somewh...
## $ LearningPlatformUsefulnessTradeBook <chr> "Somewhat useful", NA, ...
## $ LearningPlatformUsefulnessTutoring <chr> NA, NA, NA, NA, NA, NA,...
## $ LearningPlatformUsefulnessYouTube <chr> NA, NA, "Very useful", ...
## $ CurrentJobTitleSelect <chr> "DBA/Database Engineer"...
## $ MLMethodNextYearSelect <chr> "Random Forests", "Rand...
## $ WorkChallengeFrequencyPolitics <chr> "Rarely", NA, NA, "Ofte...
## $ WorkChallengeFrequencyUnusedResults <chr> NA, NA, NA, "Often", "S...
## $ WorkChallengeFrequencyUnusefulInstrumenting <chr> NA, NA, NA, "Often", NA...
## $ WorkChallengeFrequencyDeployment <chr> NA, NA, NA, "Often", NA...
## $ WorkChallengeFrequencyDirtyData <chr> NA, NA, NA, "Often", NA...
## $ WorkChallengeFrequencyExplaining <chr> NA, NA, NA, "Often", NA...
## $ WorkChallengeFrequencyPass <chr> NA, NA, NA, NA, NA, NA,...
## $ WorkChallengeFrequencyIntegration <chr> NA, NA, NA, "Often", NA...
## $ WorkChallengeFrequencyTalent <chr> NA, NA, NA, "Often", "S...
## $ WorkChallengeFrequencyDataFunds <chr> NA, NA, NA, "Often", "S...
## $ WorkChallengeFrequencyDomainExpertise <chr> NA, NA, NA, "Most of th...
## $ WorkChallengeFrequencyML <chr> NA, NA, NA, "Often", NA...
## $ WorkChallengeFrequencyTools <chr> NA, NA, NA, "Often", NA...
## $ WorkChallengeFrequencyExpectations <chr> NA, NA, NA, "Often", NA...
## $ WorkChallengeFrequencyITCoordination <chr> NA, NA, NA, NA, "Someti...
## $ WorkChallengeFrequencyHiringFunds <chr> NA, NA, NA, "Often", NA...
## $ WorkChallengeFrequencyPrivacy <chr> "Often", NA, NA, "Often...
## $ WorkChallengeFrequencyScaling <chr> "Most of the time", NA,...
## $ WorkChallengeFrequencyEnvironments <chr> NA, NA, NA, "Often", "S...
## $ WorkChallengeFrequencyClarity <chr> NA, NA, NA, "Often", NA...
## $ WorkChallengeFrequencyDataAccess <chr> NA, NA, NA, "Often", NA...
## $ WorkChallengeFrequencyOtherSelect <chr> NA, NA, NA, NA, NA, NA,...
## $ WorkInternalVsExternalTools <chr> "Do not know", NA, NA, ...
## $ FormalEducation <chr> "Bachelor's degree", "M...
## $ Age <dbl> NA, 30, 28, 56, 38, 46,...
## $ DataScienceIdentitySelect <chr> "Yes", "Yes", "Yes", "Y...
## $ JobSatisfaction <chr> "5", NA, NA, "10 - High...
# Check if CurrentJobTitleSelect is a factor
# (FALSE is expected: read_csv parsed it as character, per the spec above).
is.factor(multiple_choice_answers$CurrentJobTitleSelect)
## [1] FALSE
# mutate() and summarise() in dplyr both have variants where you can add the suffix if, all, or at to change the operation
# mutate_if() applies a function to all columns where the first argument is true
# mutate_all() applies a function to all columns
# mutate_at() affects columns selected with a character vector or select helpers (e.g. mutate_at(c("height", "weight"), log))
# NOTE(review): dplyr >= 1.0 supersedes these with across(), e.g.
# mutate(across(where(is.character), as.factor)).
# Change all the character columns to factors
responses_as_factors <- multiple_choice_answers %>%
mutate_if(is.character, as.factor)
# Make a two column dataset with variable names and number of levels
# NOTE(review): gather() is superseded by pivot_longer() in tidyr >= 1.0.
number_of_levels <- responses_as_factors %>%
summarise_all(nlevels) %>%
gather(variable, num_levels)
# dplyr has two other functions that can come in handy when exploring a dataset
# The first is top_n(x, var), which gets us the first x rows of a dataset based on the value of var
# The other is pull(), which allows us to extract a column and take out the name, leaving only the value(s) from the column
# NOTE(review): top_n() is superseded by slice_max() in dplyr >= 1.0.
# Select the 4 rows with the highest number of levels
number_of_levels %>%
top_n(4, num_levels)
## # A tibble: 4 x 2
## variable num_levels
## <chr> <int>
## 1 CurrentJobTitleSelect 16
## 2 MLMethodNextYearSelect 25
## 3 FormalEducation 7
## 4 JobSatisfaction 11
# How many levels does CurrentJobTitleSelect have?
# pull() returns the bare value instead of a one-cell tibble.
number_of_levels %>%
filter(variable=="CurrentJobTitleSelect") %>%
pull(num_levels)
## [1] 16
# Get the names of the levels of CurrentJobTitleSelect
# (levels() works here because responses_as_factors holds factor columns).
responses_as_factors %>%
pull(CurrentJobTitleSelect) %>%
levels()
## [1] "Business Analyst"
## [2] "Computer Scientist"
## [3] "Data Analyst"
## [4] "Data Miner"
## [5] "Data Scientist"
## [6] "DBA/Database Engineer"
## [7] "Engineer"
## [8] "Machine Learning Engineer"
## [9] "Operations Research Practitioner"
## [10] "Other"
## [11] "Predictive Modeler"
## [12] "Programmer"
## [13] "Researcher"
## [14] "Scientist/Researcher"
## [15] "Software Developer/Software Engineer"
## [16] "Statistician"
# Make a bar plot
ggplot(multiple_choice_answers, aes(x=FormalEducation)) +
geom_bar() +
coord_flip()
# Make a bar plot ordered by frequency: fct_infreq() sorts levels by count,
# fct_rev() flips so the most common ends up on top after coord_flip().
ggplot(multiple_choice_answers, aes(x=fct_rev(fct_infreq(FormalEducation)))) +
geom_bar() +
coord_flip()
# Order education levels by mean respondent age with fct_reorder().
multiple_choice_answers %>%
filter(!is.na(Age) & !is.na(FormalEducation)) %>%
group_by(FormalEducation) %>%
summarize(mean_age = mean(Age)) %>%
ggplot(aes(x = fct_reorder(FormalEducation, mean_age), y = mean_age)) +
geom_point() +
coord_flip()
Chapter 2 - Manipulating Factor Variables
Reordering factors:
Renaming factor levels:
"Other" = "Other (please specify)", "Everyone should share" = "The arm rests should be shared", "Aisle and window people" = "The people in the aisle and window seats get both arm rests", "Middle person" = "The person in the middle seat gets both arm rests", "Fastest person" = "Whoever puts their arm on the arm rest first")
Collapsing factor levels:
Example code includes:
multiple_choice_responses <- multiple_choice_answers
# Print the levels of WorkInternalVsExternalTools
# (NULL because this copy still holds character columns, not factors).
levels(multiple_choice_responses$WorkInternalVsExternalTools)
## NULL
# Reorder the levels from internal to external
# (fct_relevel coerces the character column to a factor as it reorders).
mc_responses_reordered <- multiple_choice_responses %>%
mutate(WorkInternalVsExternalTools = fct_relevel(WorkInternalVsExternalTools,
c('Entirely internal', 'More internal than external',
'Approximately half internal and half external',
'More external than internal', 'Entirely external',
'Do not know'
)
)
)
# Make a bar plot of the responses
ggplot(mc_responses_reordered, aes(x=WorkInternalVsExternalTools)) +
geom_bar() +
coord_flip()
multiple_choice_responses %>%
# Move "I did not complete any formal education past high school" and "Some college/university study without earning a bachelor's degree" to the front
mutate(FormalEducation = fct_relevel(FormalEducation, c("I did not complete any formal education past high school", "Some college/university study without earning a bachelor's degree"))) %>%
# Move "Doctoral degree" to be the sixth level (after = 6 places it after the sixth position counting from 1)
mutate(FormalEducation = fct_relevel(FormalEducation, after=6, "Doctoral degree")) %>%
# Move "I prefer not to answer" to be the last level (after = Inf).
mutate(FormalEducation = fct_relevel(FormalEducation, after=Inf, "I prefer not to answer")) %>%
# Examine the new level order
pull(FormalEducation) %>%
levels()
## [1] "I did not complete any formal education past high school"
## [2] "Some college/university study without earning a bachelor's degree"
## [3] "Bachelor's degree"
## [4] "Master's degree"
## [5] "Professional degree"
## [6] "Doctoral degree"
## [7] "I prefer not to answer"
# make a bar plot of the frequency of FormalEducation
ggplot(multiple_choice_responses, aes(x=FormalEducation)) +
geom_bar()
multiple_choice_responses %>%
# rename levels with fct_recode(): "new name" = "old name"
mutate(FormalEducation = fct_recode(FormalEducation, "High school" ="I did not complete any formal education past high school", "Some college" = "Some college/university study without earning a bachelor's degree")) %>%
# make a bar plot of FormalEducation
ggplot(aes(x=FormalEducation)) +
geom_bar()
multiple_choice_responses %>%
# Create new variable, grouped_titles, by collapsing levels in CurrentJobTitleSelect
mutate(grouped_titles = fct_collapse(CurrentJobTitleSelect,
"Computer Scientist" = c("Programmer", "Software Developer/Software Engineer"),
"Researcher" = "Scientist/Researcher",
"Data Analyst/Scientist/Engineer" = c("DBA/Database Engineer", "Data Scientist",
"Business Analyst", "Data Analyst",
"Data Miner", "Predictive Modeler"))) %>%
# Turn every title that isn't now one of the grouped_titles into "Other"
mutate(grouped_titles = fct_other(grouped_titles,
keep = c("Computer Scientist",
"Researcher",
"Data Analyst/Scientist/Engineer"))) %>%
# Get a count of the grouped titles (NAs stay NA, hence the warning below)
count(grouped_titles)
## Warning: Factor `grouped_titles` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## # A tibble: 5 x 2
## grouped_titles n
## <fct> <int>
## 1 Data Analyst/Scientist/Engineer 4928
## 2 Computer Scientist 2556
## 3 Researcher 1597
## 4 Other 2749
## 5 <NA> 4886
multiple_choice_responses %>%
# remove NAs of MLMethodNextYearSelect
filter(!is.na(MLMethodNextYearSelect)) %>%
# create ml_method, which lumps all those with less than 5% of people into "Other"
mutate(ml_method = fct_lump(MLMethodNextYearSelect, prop=0.05)) %>%
# print the frequency of your new variable in descending order
count(ml_method, sort=TRUE)
## # A tibble: 4 x 2
## ml_method n
## <fct> <int>
## 1 Other 4405
## 2 Deep learning 4362
## 3 Neural Nets 1386
## 4 Time Series Analysis 680
multiple_choice_responses %>%
# remove NAs
filter(!is.na(MLMethodNextYearSelect)) %>%
# create ml_method, retaining the 5 most common methods and renaming others "other method"
# NOTE(review): forcats >= 0.5 spells this fct_lump_n(..., n = 5).
mutate(ml_method = fct_lump(MLMethodNextYearSelect, 5, other_level="other method")) %>%
# print the frequency of your new variable in descending order
count(ml_method, sort=TRUE)
## # A tibble: 6 x 2
## ml_method n
## <fct> <int>
## 1 Deep learning 4362
## 2 other method 3401
## 3 Neural Nets 1386
## 4 Time Series Analysis 680
## 5 Bayesian Methods 511
## 6 Text Mining 493
Chapter 3 - Creating Factor Variables
Examining common themed variables:
mutate(work_challenge = str_remove(work_challenge, "WorkChallengeFrequency")) # will remove the string "WorkChallengeFrequency" from column work_challenge
Tricks of ggplot2:
Changing and creating variables with dplyr::case_when():
Example code includes:
learning_platform_usefulness <- multiple_choice_responses %>%
# select columns with LearningPlatformUsefulness in title
select(contains("LearningPlatformUsefulness")) %>%
# change data from wide to long
# NOTE(review): gather() is superseded by pivot_longer() in tidyr >= 1.0.
gather(learning_platform, usefulness) %>%
# remove rows where usefulness is NA
filter(!is.na(usefulness)) %>%
# remove "LearningPlatformUsefulness" from each string in `learning_platform`
mutate(learning_platform = str_remove(learning_platform, "LearningPlatformUsefulness"))
learning_platform_usefulness %>%
# change dataset to one row per learning_platform usefulness pair with number of entries for each
count(learning_platform, usefulness) %>%
# use add_count to create column with total number of answers for that learning_platform
add_count(learning_platform, wt=n, name="nn") %>%
# create a line graph for each question with usefulness on x-axis and percentage of responses on y
ggplot(aes(x = usefulness, y = n/nn, group = learning_platform)) +
geom_line() +
facet_wrap(~ learning_platform)
avg_usefulness <- learning_platform_usefulness %>%
# If usefulness is "Not Useful", make 0, else 1
# (so the mean below is the share finding the platform at least somewhat useful)
mutate(usefulness = ifelse(usefulness=="Not Useful", 0, 1)) %>%
# Get the average usefulness by learning platform
group_by(learning_platform) %>%
summarize(avg_usefulness = mean(usefulness))
# Make a scatter plot of average usefulness by learning platform
ggplot(avg_usefulness, aes(x=learning_platform, y=avg_usefulness)) +
geom_point()
ggplot(avg_usefulness, aes(x = learning_platform, y = avg_usefulness)) +
geom_point() +
# rotate x-axis text by 90 degrees
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
# rename y and x axis labels
labs(x="Learning Platform", y="Percent finding at least somewhat useful") +
# change y axis scale to percentage
scale_y_continuous(labels = scales::percent)
# Same plot, with platforms ordered from most to least useful.
ggplot(avg_usefulness,
aes(x = fct_rev(fct_reorder(learning_platform, avg_usefulness)), y = avg_usefulness)
) +
geom_point() +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
labs(x = "Learning Platform", y = "Percent finding at least somewhat useful") +
scale_y_continuous(labels = scales::percent)
# Check the min age (0 — implausible, motivating the filtering below)
min(multiple_choice_responses$Age, na.rm=TRUE)
## [1] 0
# Check the max age
max(multiple_choice_responses$Age, na.rm=TRUE)
## [1] 100
# Count missing ages
sum(is.na(multiple_choice_responses$Age))
## [1] 331
multiple_choice_responses %>%
# Eliminate any ages below 10 and above 90 (between() also drops NA ages)
filter(between(Age, 10, 90)) %>%
# Create the generation variable based on age
mutate(generation=case_when(
between(Age, 10, 22) ~ "Gen Z",
between(Age, 23, 37) ~ "Gen Y",
between(Age, 38, 52) ~ "Gen X",
between(Age, 53, 71) ~ "Baby Boomer",
between(Age, 72, 90) ~ "Silent"
)) %>%
# Get a count of how many answers in each generation
count(generation)
## # A tibble: 5 x 2
## generation n
## <chr> <int>
## 1 Baby Boomer 832
## 2 Gen X 3162
## 3 Gen Y 10281
## 4 Gen Z 2037
## 5 Silent 37
multiple_choice_responses %>%
# Filter out people who selected Data Scientist as their Job Title
filter(!is.na(CurrentJobTitleSelect) & CurrentJobTitleSelect != "Data Scientist") %>%
# Create a new variable, job_identity
mutate(job_identity = case_when(
CurrentJobTitleSelect == "Data Analyst" & DataScienceIdentitySelect == "Yes" ~ "DS analysts",
CurrentJobTitleSelect == "Data Analyst" & DataScienceIdentitySelect %in% c("No", "Sort of (Explain more)") ~ "NDS analyst",
CurrentJobTitleSelect != "Data Analyst" & DataScienceIdentitySelect == "Yes" ~ "DS non-analysts",
TRUE ~ "NDS non analysts")
) %>%
# Convert JobSatisfaction (character, with labeled endpoints) to an integer scale
mutate(JobSat=case_when(
is.na(JobSatisfaction) ~ NA_integer_,
JobSatisfaction == "I prefer not to share" | JobSatisfaction == "NA" ~ NA_integer_,
JobSatisfaction == "1 - Highly Dissatisfied" ~ 1L,
JobSatisfaction == "10 - Highly Satisfied" ~ 10L,
# NOTE(review): as.integer() is evaluated on the whole column, so the
# labeled values trigger the "NAs introduced by coercion" warning below
# even though earlier branches already captured them.
TRUE ~ as.integer(JobSatisfaction))) %>%
# Get the average job satisfaction by job_identity, removing NAs
group_by(job_identity) %>%
summarize(avg_js = mean(JobSat, na.rm=TRUE))
## Warning in eval_tidy(pair$rhs, env = default_env): NAs introduced by coercion
## # A tibble: 4 x 2
## job_identity avg_js
## <chr> <dbl>
## 1 DS analysts 6.44
## 2 DS non-analysts 6.93
## 3 NDS analyst 6.14
## 4 NDS non analysts 6.43
Chapter 4 - Case Study on Flight Etiquette
Case study introduction:
Data preparation and regex:
Recreating the plot:
End of course recap:
Example code includes:
flying_etiquette <- read.csv("./RInputFiles/flying-etiquette.csv", stringsAsFactors = FALSE)
# Clean column names: replace the dots read.csv inserts with spaces.
# NOTE(review): the outer str_replace_all presumably collapsed doubled
# spaces before transcription flattened the whitespace — TODO confirm
# against the original source; as written it replaces " " with " ".
names(flying_etiquette) <-
stringr::str_replace_all(stringr::str_replace_all(names(flying_etiquette), "\\.", " "), " ", " ")
names(flying_etiquette) <- stringr::str_trim(names(flying_etiquette))
# Restore the question marks stripped from the survey-question columns.
names(flying_etiquette)[2:22] <- paste0(names(flying_etiquette)[2:22], "?")
names(flying_etiquette) <- stringr::str_replace_all(names(flying_etiquette), "itrude", "it rude")
glimpse(flying_etiquette)
## Observations: 1,040
## Variables: 27
## $ RespondentID <dbl> ...
## $ `How often do you travel by plane?` <chr> ...
## $ `Do you ever recline your seat when you fly?` <chr> ...
## $ `How tall are you?` <chr> ...
## $ `Do you have any children under 18?` <chr> ...
## $ `In a row of three seats who should get to use the two arm rests?` <chr> ...
## $ `In a row of two seats who should get to use the middle arm rest?` <chr> ...
## $ `Who should have control over the window shade?` <chr> ...
## $ `Is it rude to move to an unsold seat on a plane?` <chr> ...
## $ `Generally speaking is it rude to say more than a few words tothe stranger sitting next to you on a plane?` <chr> ...
## $ `On a 6 hour flight from NYC to LA how many times is it acceptable to get up if you re not in an aisle seat?` <chr> ...
## $ `Under normal circumstances does a person who reclines their seat during a flight have any obligation to the person sitting behind them?` <chr> ...
## $ `Is it rude to recline your seat on a plane?` <chr> ...
## $ `Given the opportunity would you eliminate the possibility of reclining seats on planes entirely?` <chr> ...
## $ `Is it rude to ask someone to switch seats with you in order to be closer to friends?` <chr> ...
## $ `Is it rude to ask someone to switch seats with you in order to be closer to family?` <chr> ...
## $ `Is it rude to wake a passenger up if you are trying to go to the bathroom?` <chr> ...
## $ `Is it rude to wake a passenger up if you are trying to walk around?` <chr> ...
## $ `In general is it rude to bring a baby on a plane?` <chr> ...
## $ `In general is it rude to knowingly bring unruly children on a plane?` <chr> ...
## $ `Have you ever used personal electronics during take off or landing in violation of a flight attendant s direction?` <chr> ...
## $ `Have you ever smoked a cigarette in an airplane bathroom when it was against the rules?` <chr> ...
## $ Gender <chr> ...
## $ Age <chr> ...
## $ `Household Income` <chr> ...
## $ Education <chr> ...
## $ `Location Census Region` <chr> ...
gathered_data <- flying_etiquette %>%
mutate_if(is.character, as.factor) %>%
filter(`How often do you travel by plane?` != "Never") %>%
# Select columns containing "rude"
select(contains("rude")) %>%
# Change format from wide to long
gather(response_var, value)
## Warning: attributes are not identical across measure variables;
## they will be dropped
rude_behaviors <- gathered_data %>%
mutate(response_var = str_replace(response_var, '.*rude to ', '')) %>%
mutate(response_var = str_replace(response_var, 'on a plane', '')) %>%
mutate(rude = if_else(value %in% c("No, not rude at all", "No, not at all rude"), 0, 1)) %>%
# Create perc_rude, the percent considering each behavior rude
group_by(response_var) %>%
summarize(perc_rude=mean(rude))
rude_behaviors
## # A tibble: 9 x 2
## response_var perc_rude
## <chr> <dbl>
## 1 ask someone to switch seats with you in order to be closer to famil~ 0.193
## 2 ask someone to switch seats with you in order to be closer to frien~ 0.278
## 3 bring a baby ? 0.323
## 4 knowingly bring unruly children ? 0.832
## 5 move to an unsold seat ? 0.211
## 6 recline your seat ? 0.426
## 7 say more than a few words tothe stranger sitting next to you ? 0.228
## 8 wake a passenger up if you are trying to go to the bathroom? 0.388
## 9 wake a passenger up if you are trying to walk around? 0.741
# Create an ordered by plot of behavior by percentage considering it rude
initial_plot <- ggplot(rude_behaviors, aes(x=fct_reorder(response_var, perc_rude), y=perc_rude)) +
geom_col()
# View your plot
initial_plot
titled_plot <- initial_plot +
# Add the title, subtitle, and caption
labs(title = "Hell Is Other People In A Pressurized Metal Tube",
subtitle = "Percentage of 874 air-passenger respondents who said action is very or somewhat rude",
caption = "Source: SurveyMonkey Audience",
# Remove the x- and y-axis labels
x="",
y=""
)
titled_plot
flipped_plot <- titled_plot +
# Flip the axes
coord_flip() +
# Remove the x-axis ticks and labels
theme(axis.ticks.x = element_blank(),
axis.text.x = element_blank())
flipped_plot +
# Add labels above the bar with the perc value
geom_text(aes(label = paste0(round(100*perc_rude), "%"), y = perc_rude + .03),
position = position_dodge(0.9), vjust = 1)
Chapter 1 - Introduction to Bayesian Modeling
Prior model:
Data and likelihood:
Posterior model:
Model sketch: likelihood X ~ dbin(p, n) (note that the order of n and p is reversed relative to base R's rbinom — a known RJAGS difference); prior p ~ dbeta(a, b). Example code includes:
# Make sure you have installed JAGS-4.x.y.exe (for any x >=0, y>=0) from http://www.sourceforge.net/projects/mcmc-jags/files
# Sample 10000 draws from Beta(45,55) prior
prior_A <- rbeta(n = 10000, shape1 = 45, shape2 = 55)
# Store the results in a data frame
prior_sim <- data.frame(prior_A)
# Construct a density plot of the prior sample
ggplot(prior_sim, aes(x = prior_A)) +
geom_density()
# Sample 10000 draws from the Beta(1,1) prior
prior_B <- rbeta(n = 10000, shape1 = 1, shape2 = 1)
# Sample 10000 draws from the Beta(100,100) prior
prior_C <- rbeta(n = 10000, shape1 = 100, shape2 = 100)
# Combine the results in a single data frame
prior_sim <- data.frame(samples = c(prior_A, prior_B, prior_C),
priors = rep(c("A","B","C"), each = 10000))
# Plot the 3 priors
ggplot(prior_sim, aes(x = samples, fill = priors)) +
geom_density(alpha = 0.5)
# Define a vector of 1000 p values
p_grid <- seq(0, 1, length.out=1000)
# Simulate 1 poll result for each p in p_grid
poll_result <- rbinom(1000, 10, prob=p_grid)
# Create likelihood_sim data frame
likelihood_sim <- data.frame(p_grid, poll_result)
# Density plots of p_grid grouped by poll_result
ggplot(likelihood_sim, aes(x = p_grid, y = poll_result, group = poll_result)) +
ggridges::geom_density_ridges()
# Density plots of p_grid grouped by poll_result
ggplot(likelihood_sim, aes(x = p_grid, y = poll_result, group = poll_result, fill = poll_result==6)) +
ggridges::geom_density_ridges()
# Keep the polls with X = 6
likelihood_sim_6 <- likelihood_sim %>%
filter(poll_result==6)
# Construct a density plot of the remaining p_grid values
ggplot(likelihood_sim_6, aes(x = p_grid)) +
geom_density() +
lims(x = c(0,1))
# DEFINE the model
vote_model <- "model{
# Likelihood model for X
X ~ dbin(p, n)
# Prior model for p
p ~ dbeta(a, b)
}"
# COMPILE the model
vote_jags <- jags.model(textConnection(vote_model),
data = list(a = 45, b = 55, X = 6, n = 10),
inits = list(.RNG.name = "base::Wichmann-Hill", .RNG.seed = 100))
# SIMULATE the posterior
vote_sim <- coda.samples(model = vote_jags, variable.names = c("p"), n.iter = 10000)
# PLOT the posterior
plot(vote_sim, trace = FALSE)
# COMPILE the model
vote_jags <- jags.model(textConnection(vote_model),
data = list(a = 1, b = 1, X = 6, n = 10),
inits = list(.RNG.name = "base::Wichmann-Hill", .RNG.seed = 100))
# SIMULATE the posterior
vote_sim <- coda.samples(model = vote_jags, variable.names = c("p"), n.iter = 10000)
# PLOT the posterior
plot(vote_sim, trace = FALSE, xlim = c(0,1), ylim = c(0,18))
# COMPILE the model
vote_jags <- jags.model(textConnection(vote_model),
data = list(a = 1, b = 1, X = 220, n = 400),
inits = list(.RNG.name = "base::Wichmann-Hill", .RNG.seed = 100))
# SIMULATE the posterior
vote_sim <- coda.samples(model = vote_jags, variable.names = c("p"), n.iter = 10000)
# PLOT the posterior
plot(vote_sim, trace = FALSE, xlim = c(0,1), ylim = c(0,18))
# COMPILE the model
vote_jags <- jags.model(textConnection(vote_model),
data = list(a = 45, b = 55, X = 220, n = 400),
inits = list(.RNG.name = "base::Wichmann-Hill", .RNG.seed = 100))
# SIMULATE the posterior
vote_sim <- coda.samples(model = vote_jags, variable.names = c("p"), n.iter = 10000)
# PLOT the posterior
plot(vote_sim, trace = FALSE, xlim = c(0,1), ylim = c(0,18))
Chapter 2 - Bayesian Models and Markov Chains
Normal-Normal Model:
Simulating Normal-Normal in RJAGS:
# Likelihood model for Y[i]for(i in 1:length(Y)) { Y[i] ~ dnorm(m, s^(-2)) # requires precision which can be defined as the inverse of sigma-squared } # Prior models for m and s m ~ dnorm(50, 25^(-2)) # requires precision which can be defined as the inverse of sigma-squared s ~ dunif(0, 200) Markov chains:
Markov chain diagnostics and reproducibility:
Example code includes:
# Take 10000 samples from the m prior
prior_m <- rnorm(10000, 50, 25)
# Take 10000 samples from the s prior
prior_s <- runif(10000, 0, 200)
# Store samples in a data frame
samples <- data.frame(prior_m, prior_s)
# Density plots of the prior_m & prior_s samples
ggplot(samples, aes(x = prior_m)) +
geom_density()
ggplot(samples, aes(x = prior_s)) +
geom_density()
# Check out the first 6 rows of sleep_study
head(sleep_study)
# Define diff_3
sleep_study <- sleep_study %>%
mutate(diff_3=day_3-day_0)
# Histogram of diff_3
ggplot(sleep_study, aes(x = diff_3)) +
geom_histogram(binwidth = 20, color = "white")
# Mean and standard deviation of diff_3
sleep_study %>%
summarize(mean(diff_3), sd(diff_3))
# DEFINE the model
sleep_model <- "model{
# Likelihood model for Y[i]
for(i in 1:length(Y)) {
Y[i] ~ dnorm(m, s^(-2))
}
# Prior models for m and s
m ~ dnorm(50, 25^(-2))
s ~ dunif(0, 200)
}"
# COMPILE the model
sleep_jags <- jags.model(textConnection(sleep_model), data = list(Y = sleep_study$diff_3),
inits = list(.RNG.name = "base::Wichmann-Hill", .RNG.seed = 1989))
# SIMULATE the posterior
sleep_sim <- coda.samples(model = sleep_jags, variable.names = c("m", "s"), n.iter = 10000)
# PLOT the posterior
plot(sleep_sim, trace = FALSE)
# Let m be the average change in reaction time after 3 days of sleep deprivation
# In a previous exercise, you obtained an approximate sample of 10,000 draws from the posterior model of m
# You stored the resulting mcmc.list object as sleep_sim which is loaded in your workspace:
# In fact, the sample of m values in sleep_sim is a dependent Markov chain, the distribution of which converges to the posterior
# You will examine the contents of sleep_sim and, to have finer control over your analysis, store the contents in a data frame
# Check out the head of sleep_sim
head(sleep_sim)
# Store the chains in a data frame
sleep_chains <- data.frame(sleep_sim[[1]], iter = 1:10000)
# Check out the head of sleep_chains
head(sleep_chains)
# NOTE: The 10,000 recorded Iterations start after a "burn-in" period in which samples are discarded
# Thus the Iterations count doesn't start at 1!
# Use plot() to construct trace plots of the m and s chains
plot(sleep_sim, density=FALSE)
# Use ggplot() to construct a trace plot of the m chain
ggplot(sleep_chains, aes(x = iter, y = m)) +
geom_line()
# Trace plot the first 100 iterations of the m chain
ggplot(dplyr::filter(sleep_chains, iter<=100), aes(x = iter, y = m)) + geom_line()
# Note that the longitudinal behavior of the chain appears quite random and that the trend remains relatively constant
# This is a good thing - it indicates that the Markov chain (likely) converges quickly to the posterior distribution of m
# Use plot() to construct density plots of the m and s chains
plot(sleep_sim, trace=FALSE)
# Use ggplot() to construct a density plot of the m chain
ggplot(sleep_chains, aes(x = m)) +
geom_density()
# Density plot of the first 100 values in the m chain
ggplot(dplyr::filter(sleep_chains, iter<=100), aes(x = m)) +
geom_density()
# COMPILE the model
sleep_jags_multi <- jags.model(textConnection(sleep_model), data = list(Y = sleep_study$diff_3), n.chains=4)
# SIMULATE the posterior
sleep_sim_multi <- coda.samples(model = sleep_jags_multi, variable.names = c("m", "s"), n.iter = 1000)
# Check out the head of sleep_sim_multi
head(sleep_sim_multi)
# Construct trace plots of the m and s chains
plot(sleep_sim_multi, density=FALSE)
# The mean of the m Markov chain provides an estimate of the posterior mean of m
# The naive standard error provides a measure of the estimate's accuracy.
# Suppose your goal is to estimate the posterior mean of m within a standard error of 0.1 ms
# If the observed naive standard error exceeds this target, no problem!
# You can simply run a longer chain
# SIMULATE the posterior
sleep_sim_1 <- coda.samples(model = sleep_jags, variable.names = c("m", "s"), n.iter = 1000)
# Summarize the m and s chains of sleep_sim_1
summary(sleep_sim_1)
# RE-SIMULATE the posterior
sleep_sim_2 <- coda.samples(model = sleep_jags, variable.names = c("m", "s"), n.iter = 10000)
# Summarize the m and s chains of sleep_sim_2
summary(sleep_sim_2)
# COMPILE the model
sleep_jags <- jags.model(textConnection(sleep_model), data = list(Y = sleep_study$diff_3), inits = list(.RNG.name = "base::Wichmann-Hill", .RNG.seed = 1989))
# SIMULATE the posterior
sleep_sim <- coda.samples(model = sleep_jags, variable.names = c("m", "s"), n.iter = 10000)
# Summarize the m and s chains of sleep_sim
summary(sleep_sim)
Chapter 3 - Bayesian Inference and Prediction
Simple Bayesian Regression Model:
Bayesian Regression in RJAGS:
# Likelihood model for Y[i] for(i in 1:length(Y)) { Y[i] ~ dnorm(m[i], s^(-2)) m[i] <- a + b * X[i] } # Prior models for a, b, s a ~ dnorm(0, 200^(-2)) b ~ dnorm(1, 0.5^(-2)) s ~ dunif(0, 20) Posterior estimation and inference:
Posterior prediction:
Example code includes:
# Note the 3 parameters in the model of weight by height: intercept a, slope b, & standard deviation s
# In the first step of your Bayesian analysis, you will simulate the following prior models for these parameters: a ~ N(0, 200^2), b ~ N(1, 0.5^2), and s ~ Unif(0, 20)
# Take 10000 samples from the a, b, & s priors
prior_a <- rnorm(10000, 0, 200)
prior_b <- rnorm(10000, 1, 0.5)
prior_s <- runif(10000, 0, 20)
# Store samples in a data frame
samples <- data.frame(prior_a, prior_b, prior_s, set=1:10000)
# Construct density plots of the prior samples
ggplot(samples, aes(x = prior_a)) +
geom_density()
ggplot(samples, aes(x = prior_b)) +
geom_density()
ggplot(samples, aes(x = prior_s)) +
geom_density()
# Replicate the first 12 parameter sets 50 times each
prior_scenarios_rep <- bind_rows(replicate(n = 50, expr = samples[1:12, ], simplify = FALSE))
# Simulate 50 height & weight data points for each parameter set
prior_simulation <- prior_scenarios_rep %>%
mutate(height = rnorm(600, 170, 10)) %>%
mutate(weight = rnorm(600, prior_a + prior_b*height, prior_s))
# Plot the simulated data & regression model for each parameter set
ggplot(prior_simulation, aes(x = height, y = weight)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, size = 0.75) +
facet_wrap(~ set)
# The bdims data set from the openintro package is loaded in your workspace
# bdims contains physical measurements on a sample of 507 individuals, including their weight in kg (wgt) and height in cm (hgt)
# Construct a scatterplot of wgt vs hgt
ggplot(bdims, aes(x = hgt, y = wgt)) +
geom_point()
# Add a model smooth
ggplot(bdims, aes(x = hgt, y = wgt)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE)
# Obtain the sample regression model
wt_model <- lm(wgt ~ hgt, data = bdims)
# Summarize the model
summary(wt_model)
# DEFINE the model
weight_model <- "model{
# Likelihood model for Y[i]
for(i in 1:length(Y)) {
Y[i] ~ dnorm(m[i], s^(-2))
m[i] <- a + b * X[i]
}
# Prior models for a, b, s
a ~ dnorm(0, 200^(-2))
b ~ dnorm(1, 0.5^(-2))
s ~ dunif(0, 20)
}"
# COMPILE the model
weight_jags <- jags.model(textConnection(weight_model), data = list(X=bdims$hgt, Y=bdims$wgt),
inits = list(.RNG.name = "base::Wichmann-Hill", .RNG.seed = 1989))
# COMPILE the model
weight_jags <- jags.model(textConnection(weight_model), data = list(Y = bdims$wgt, X = bdims$hgt),
inits = list(.RNG.name = "base::Wichmann-Hill", .RNG.seed = 1989))
# SIMULATE the posterior
weight_sim <- coda.samples(model = weight_jags, variable.names = c("a", "b", "s"), n.iter = 1000)
# PLOT the posterior
plot(weight_sim)
# A 100,000 iteration RJAGS simulation of the posterior, weight_sim_big, is in your workspace along with a data frame of the Markov chain output:
head(weight_chains, 2)
# The posterior means of the intercept & slope parameters, a & b, reflect the posterior mean trend in the relationship between weight & height
# In contrast, the full posteriors of a & b reflect the range of plausible parameters, thus posterior uncertainty in the trend
# You will examine the trend and uncertainty in this trend below
# The bdims data are in your workspace
# Summarize the posterior Markov chains
summary(weight_sim_big)
# Calculate the estimated posterior mean of b
mean(weight_chains$b)
# Plot the posterior mean regression model
ggplot(bdims, aes(x=hgt, y=wgt)) +
geom_point() +
geom_abline(intercept = mean(weight_chains$a), slope = mean(weight_chains$b), color = "red")
# Visualize the range of 20 posterior regression models
ggplot(bdims, aes(x=hgt, y=wgt)) +
geom_point() +
geom_abline(intercept = weight_chains$a[1:20], slope = weight_chains$b[1:20], color = "gray", size = 0.25)
# Summarize the posterior Markov chains
summary(weight_sim_big)
# Calculate the 95% posterior credible interval for b
quantile(weight_chains$b, c(0.025, 0.975))
# Calculate the 90% posterior credible interval for b
quantile(weight_chains$b, c(0.05, 0.95))
# Mark the 90% credible interval
ggplot(weight_chains, aes(x = b)) +
geom_density() +
geom_vline(xintercept = quantile(weight_chains$b, c(0.05, 0.95)), color = "red")
# Mark 1.1 on a posterior density plot for b
ggplot(weight_chains, aes(x=b)) +
geom_density() +
geom_vline(xintercept = 1.1, color = "red")
# Summarize the number of b chain values that exceed 1.1
table(weight_chains$b > 1.1)
# Calculate the proportion of b chain values that exceed 1.1
mean(weight_chains$b > 1.1)
# Calculate the trend under each Markov chain parameter set
weight_chains <- weight_chains %>%
mutate(m_180 = a + b*180)
# Construct a posterior density plot of the trend
ggplot(weight_chains, aes(x = m_180)) +
geom_density()
# Calculate the average trend
mean(weight_chains$m_180)
# Construct a posterior credible interval for the trend
quantile(weight_chains$m_180, c(0.025, 0.975))
# Simulate 1 prediction under the first parameter set
rnorm(1, mean=weight_chains$m_180[1], sd=weight_chains$s[1])
# Simulate 1 prediction under the second parameter set
rnorm(1, mean=weight_chains$m_180[2], sd=weight_chains$s[2])
# Simulate & store 1 prediction under each parameter set
weight_chains <- weight_chains %>%
mutate(Y_180=rnorm(nrow(weight_chains), mean=m_180, sd=s))
# Print the first 6 parameter sets & predictions
head(weight_chains)
# Construct a density plot of the posterior predictions
ggplot(weight_chains, aes(x=Y_180)) +
geom_density() +
geom_vline(xintercept = quantile(weight_chains$Y_180, c(0.025, 0.975)), color = "red")
# Construct a posterior credible interval for the prediction
quantile(weight_chains$Y_180, c(0.025, 0.975))
# Visualize the credible interval on a scatterplot of the data
ggplot(bdims, aes(x=hgt, y=wgt)) +
geom_point() +
geom_abline(intercept = mean(weight_chains$a), slope = mean(weight_chains$b), color = "red") +
geom_segment(x = 180, xend = 180, y = quantile(weight_chains$Y_180, c(0.025)), yend = quantile(weight_chains$Y_180, c(0.975)), color = "red")
Chapter 4 - Multivariate and Generalized Linear Models
Bayesian regression with categorical predictor:
# Likelihood model for Y[i] for(i in 1:length(Y)) { Y[i] ~ dnorm(m[i], s^(-2)) m[i] <- a + b[X[i]] } # Prior models for a, b, s a ~ dnorm(400, 100^(-2)) s ~ dunif(0, 200) b[1] <- 0 b[2] ~ dnorm(0, 200^(-2)) Multivariate Bayesian regression:
# Likelihood model for Y[i] for(i in 1:length(Y)) { Y[i] ~ dnorm(m[i], s^(-2)) m[i] <- a + b[X[i]] + c * Z[i] } # Prior models for a, b, c, s a ~ dnorm(0, 200^(-2)) b[1] <- 0 b[2] ~ dnorm(0, 200^(-2)) c ~ dnorm(0, 20^(-2)) s ~ dunif(0, 200) Bayesian Poisson regression:
# Likelihood model for Y[i] for(i in 1:length(Y)) { Y[i] ~ dpois(l[i]) log(l[i]) <- a + b[X[i]] + c*Z[i] } # Prior models for a, b, c a ~ dnorm(0, 200^(-2)) b[1] <- 0 b[2] ~ dnorm(0, 2^(-2)) c ~ dnorm(0, 2^(-2)) Wrap up:
Example code includes:
# Confirm that weekday is a factor variable
is.factor(RailTrail$weekday)
# Construct a density plot of volume by weekday
ggplot(RailTrail, aes(x = volume, fill = weekday)) +
geom_density(alpha = 0.5)
# Calculate the mean volume on weekdays vs weekends
RailTrail %>%
group_by(weekday) %>%
summarize(mean(volume))
# DEFINE the model
rail_model_1 <- "model{
# Likelihood model for Y[i]
for(i in 1:length(Y)) {
Y[i] ~ dnorm(m[i], s^(-2))
m[i] <- a + b[X[i]]
}
# Prior models for a, b, s
a ~ dnorm(400, 100^(-2))
b[1] <- 0
b[2] ~ dnorm(0, 200^(-2))
s ~ dunif(0, 200)
}"
# COMPILE the model
rail_jags_1 <- jags.model(textConnection(rail_model_1),
data = list(Y=RailTrail$volume, X=RailTrail$weekday),
inits = list(.RNG.name = "base::Wichmann-Hill", .RNG.seed = 10)
)
# COMPILE the model
rail_jags_1 <- jags.model(textConnection(rail_model_1), data = list(Y = RailTrail$volume, X = RailTrail$weekday),
inits = list(.RNG.name = "base::Wichmann-Hill", .RNG.seed = 10))
# SIMULATE the posterior
rail_sim_1 <- coda.samples(model = rail_jags_1, variable.names = c("a", "b", "s"), n.iter = 10000)
# Store the chains in a data frame
rail_chains_1 <- data.frame(rail_sim_1[[1]])
# PLOT the posterior
plot(rail_sim_1)
# Posterior probability that typical volume is lower on weekdays
mean(rail_chains_1$'b.2.' < 0)
# Construct a chain of values for the typical weekday volume
rail_chains_1 <- rail_chains_1 %>%
mutate(weekday_mean = a + b.2.)
# Construct a density plot of the weekday chain
ggplot(rail_chains_1, aes(x=weekday_mean)) +
geom_density()
# 95% credible interval for typical weekday volume
quantile(rail_chains_1$weekday_mean, c(0.025, 0.975))
# Construct a plot of volume by hightemp & weekday
ggplot(RailTrail, aes(x=hightemp, y=volume, color=weekday)) +
geom_point()
# Construct a sample model
rail_lm <- lm(volume ~ weekday + hightemp, data=RailTrail)
# Summarize the model
summary(rail_lm)
# Superimpose sample estimates of the model lines
ggplot(RailTrail, aes(x=hightemp, y=volume, color=weekday)) +
geom_point() +
geom_abline(intercept = coef(rail_lm)["(Intercept)"], slope = coef(rail_lm)["hightemp"], color = "red") +
geom_abline(intercept = sum(coef(rail_lm)[c("(Intercept)", "weekdayTRUE")]), slope = coef(rail_lm)["hightemp"], color = "turquoise3")
# DEFINE the model
rail_model_2 <- "model{
# Likelihood model for Y[i]
for(i in 1:length(Y)){
Y[i] ~ dnorm(m[i], s^(-2))
m[i] <- a + b[X[i]] + c * Z[i]
}
# Prior models for a, b, c, s
a ~ dnorm(0, 200^(-2))
b[1] <- 0
b[2] ~ dnorm(0, 200^(-2))
c ~ dnorm(0, 20^(-2))
s ~ dunif(0, 200)
}"
# COMPILE the model
rail_jags_2 <- jags.model(textConnection(rail_model_2),
data = list(Y=RailTrail$volume, X=RailTrail$weekday, Z=RailTrail$hightemp),
inits = list(.RNG.name = "base::Wichmann-Hill", .RNG.seed = 10)
)
# SIMULATE the posterior
rail_sim_2 <- coda.samples(model = rail_jags_2, variable.names = c("a", "b", "c", "s"), n.iter = 10000)
# Store the chains in a data frame
rail_chains_2 <- data.frame(rail_sim_2[[1]])
# PLOT the posterior
plot(rail_sim_2)
# Summarize the posterior Markov chains
summary(rail_sim_2)
# Plot the posterior mean regression models
ggplot(RailTrail, aes(x=hightemp, y=volume, color=weekday)) +
geom_point() +
geom_abline(intercept = mean(rail_chains_2[, "a"]), slope = mean(rail_chains_2[, "c"]), color = "red") +
geom_abline(intercept = mean(rail_chains_2[, "a"]) + mean(rail_chains_2[, "b.2."]), slope = mean(rail_chains_2[, "c"]), color = "turquoise3")
# Posterior probability that typical volume is lower on weekdays
mean(rail_chains_2$'b.2.' < 0)
# DEFINE the model
poisson_model <- "model{
# Likelihood model for Y[i]
for(i in 1:length(Y)) {
Y[i] ~ dpois(l[i])
log(l[i]) <- a + b[X[i]] + c * Z[i]
}
# Prior models for a, b, c
a ~ dnorm(0, 200^(-2))
b[1] <- 0
b[2] ~ dnorm(0, 2^(-2))
c ~ dnorm(0, 2^(-2))
}"
# COMPILE the model
poisson_jags <- jags.model(textConnection(poisson_model),
data = list(Y=RailTrail$volume, X=RailTrail$weekday, Z=RailTrail$hightemp),
inits = list(.RNG.name = "base::Wichmann-Hill", .RNG.seed = 10)
)
# SIMULATE the posterior
poisson_sim <- coda.samples(model = poisson_jags, variable.names = c("a", "b", "c"), n.iter = 10000)
# Store the chains in a data frame
poisson_chains <- data.frame(poisson_sim[[1]])
# PLOT the posterior
plot(poisson_sim)
# Summarize the posterior Markov chains
summary(poisson_sim)
# Plot the posterior mean regression models
ggplot(RailTrail, aes(x = hightemp, y = volume, color = weekday)) +
geom_point() +
stat_function(fun = function(x){exp(5.01352 + 0.01426 * x)}, color = "red") +
stat_function(fun = function(x){exp(5.01352 - 0.12800 + 0.01426 * x)}, color = "turquoise3")
# Calculate the typical volume on 80 degree weekends & 80 degree weekdays
poisson_chains <- poisson_chains %>%
mutate(l_weekend=exp(a + c*80)) %>%
mutate(l_weekday=exp(a + b.2. + c*80))
# Construct a 95% CI for typical volume on 80 degree weekend
quantile(poisson_chains$l_weekend, c(0.025, 0.975))
# Construct a 95% CI for typical volume on 80 degree weekday
quantile(poisson_chains$l_weekday, c(0.025, 0.975))
# Simulate weekend & weekday predictions under each parameter set
poisson_chains <- poisson_chains %>%
mutate(Y_weekend=rpois(nrow(poisson_chains), l_weekend)) %>%
mutate(Y_weekday=rpois(nrow(poisson_chains), l_weekday))
# Print the first 6 sets of parameter values & predictions
head(poisson_chains)
# Construct a density plot of the posterior weekday predictions
ggplot(poisson_chains, aes(x=Y_weekday)) +
geom_density()
# Posterior probability that weekday volume is less 400
mean(poisson_chains$Y_weekday < 400)
Chapter 1 - Can I run my application in parallel?
Partitioning problems in to independent pieces:
Models of parallel computing:
R packages for parallel computing:
Example code includes:
# Extract every word of one Jane Austen novel, lower-cased.
#
# Args:
#   book_name: character scalar naming one of the books in austen_books().
# Returns: character vector of lower-cased words, in order of appearance.
extract_words <- function(book_name) {
  # Pull the lines of text belonging to the requested book
  book_text <- subset(austen_books(), book == book_name)$text
  # Split each line on word boundaries, flatten, and normalise case
  tolower(unlist(str_extract_all(book_text, boundary("word"))))
}
# Build one long vector containing the words of all six Jane Austen novels.
#
# Returns: character vector of lower-cased words across every book, with
#   names derived from the book titles (as produced by sapply's USE.NAMES).
janeausten_words <- function() {
  # Titles of the six novels bundled with janeaustenr
  titles <- as.character(unique(austen_books()$book))
  # One word vector per book, flattened into a single vector
  unlist(sapply(titles, extract_words))
}
# Assemble the six Jane Austen novels shipped with janeaustenr into a single
# tibble-classed data frame with columns:
#   text - one line of novel text per row
#   book - factor giving the title, levels in publication order
austen_books <- function ()
{
  novels <- list('Sense & Sensibility' = janeaustenr::sensesensibility,
                 'Pride & Prejudice' = janeaustenr::prideprejudice,
                 'Mansfield Park' = janeaustenr::mansfieldpark,
                 'Emma' = janeaustenr::emma,
                 'Northanger Abbey' = janeaustenr::northangerabbey,
                 'Persuasion' = janeaustenr::persuasion
  )
  out <- data.frame(text = unlist(novels, use.names = FALSE),
                    stringsAsFactors = FALSE)
  # Repeat each title once per line of its novel
  out$book <- factor(rep(names(novels), sapply(novels, length)))
  # Re-level so factor order follows appearance (publication) order,
  # not the default alphabetical order
  out$book <- factor(out$book, levels = unique(out$book))
  # Tag with tibble classes for tidyverse-friendly printing
  structure(out, class = c("tbl_df", "tbl", "data.frame"))
}
# Find the most frequent word starting with `letter` among `words` that is
# at least `min_length` characters long.
#
# Args:
#   letter:     starting letter used to filter the words.
#   words:      character vector of words to search.
#   min_length: minimum word length to consider (default 1).
# Returns: named integer of length 1 - the modal word and its count.
max_frequency <- function(letter, words, min_length = 1) {
  candidates <- select_words(letter, words = words, min_length = min_length)
  # Tabulate occurrences and keep the single most frequent entry
  counts <- table(candidates)
  counts[which.max(counts)]
}
# Return the words that start with `letter` and are at least `min_length`
# characters long.
#
# Args:
#   letter:     prefix to match. Matched literally via startsWith(), so
#               characters that happen to be regex metacharacters are safe
#               (the original grep(paste0("^", letter), ...) was not).
#   words:      character vector of candidate words.
#   min_length: minimum word length to keep (default 1).
# Returns: character vector of matching words (possibly empty).
select_words <- function(letter, words, min_length = 1) {
  long_enough <- words[nchar(words) >= min_length]
  # Literal prefix comparison; behaviourally identical to the regex for
  # ordinary letters, robust for any other prefix string
  long_enough[startsWith(long_enough, letter)]
}
# Vector of words from all six books
words <- janeausten_words()
# Most frequent "a"-word that is at least 5 chars long
max_frequency(letter = "a", words = words, min_length = 5)
## again
## 1001
# Partitioning
result <- lapply(letters, FUN=max_frequency,
words = words, min_length = 5) %>% unlist()
# barplot of result
barplot(result, las = 2)
# Monte Carlo demo: estimate the sampling distribution of the mean of
# `sample_size` normal draws, repeated `replicates` times.
replicates <- 50
sample_size <- 10000
# Function that computes mean of normal random numbers
# `...` is forwarded to rnorm(), so mean/sd can be varied by the caller
myfunc <- function(n, ...) mean(rnorm(n, ...))
# Init result, set seed & repeat the task sequentially
# Preallocated result vector; seed fixed for reproducibility
result <- rep(NA, replicates)
set.seed(123)
for(iter in 1:replicates) result[iter] <- myfunc(sample_size)
# View result
hist(result)
# Use sapply() with different distribution parameters
# Same experiment in one call, with non-default mean = 10, sd = 5
hist(sapply(rep(sample_size, replicates), FUN=myfunc, mean = 10, sd = 5))
# We'll now introduce a demographic model to be used throughout the course. It projects net migration rates via an AR(1) model: rate(t+1) - mu = phi * (rate(t) - mu) + error, where the error has variance sigma^2
# An MCMC estimation for the USA resulted in 1000 samples of the parameters mu, phi and sigma
# The task is to project the future distribution of migration rates
# Project a migration-rate trajectory `len` steps ahead with an AR(1) model.
#
# Args:
#   est:   named vector with elements 'mu', 'phi' and 'sigma' (one MCMC
#          parameter sample).
#   rate0: starting migration rate.
#   len:   number of future time points to simulate (default 15).
# Returns: numeric vector of length `len` of simulated rates.
ar1_trajectory <- function(est, rate0, len = 15) {
  out <- rep(NA, len)
  current <- rate0
  for (step in seq_len(len)) {
    # Each step feeds the previous simulated rate back into the model
    out[step] <- ar1(est, r = current)
    current <- out[step]
  }
  out
}
# One AR(1) step: mu + phi * (r - mu) plus N(0, sigma) noise.
#
# Args:
#   est: named vector with elements 'mu', 'phi' and 'sigma'.
#   r:   current rate.
# Returns: length-1 numeric, the next simulated rate (stochastic).
ar1 <- function(est, r) {
  mean_part <- est['mu'] + est['phi'] * (r - est['mu'])
  mean_part + rnorm(1, sd = est['sigma'])
}
# Simulate a block of AR(1) trajectories from one MCMC parameter sample.
# NOTE(review): reads the estimation data frame `ar1est` from the calling
# environment rather than taking it as an argument - confirm it is defined
# before calling.
#
# Args:
#   id:         row index into `ar1est` selecting the parameter sample.
#   rate0:      starting migration rate (default 0.015).
#   traj_len:   number of time points per trajectory (default 15).
#   block_size: number of trajectories to simulate (default 10).
# Returns: block_size x traj_len numeric matrix, one trajectory per row.
ar1_block <- function(id, rate0 = 0.015, traj_len = 15, block_size = 10) {
  # Extract the parameter row once; the loop only consumes random numbers
  params <- unlist(ar1est[id, ])
  block <- matrix(NA, nrow = block_size, ncol = traj_len)
  for (row in seq_len(block_size)) {
    block[row, ] <- ar1_trajectory(params, rate0 = rate0, len = traj_len)
  }
  block
}
# Plot the median projected migration rate over time with an 80% interval
# band (10th-90th percentiles across trajectories).
#
# Args:
#   trajs: numeric matrix; rows = simulated trajectories, cols = time points.
# Side effect: prints a ggplot (quinquennial time axis starting in 2020).
show_migration <- function(trajs) {
# One row per projected time point; column-wise summaries across trajectories
df <- data.frame(time = seq(2020, by = 5, len = ncol(trajs)),
migration_rate = apply(trajs, 2, median),
lower = apply(trajs, 2, quantile, 0.1),
upper = apply(trajs, 2, quantile, 0.9)
)
# Ribbon = 10%-90% band, line = median trajectory
g <- ggplot(df, aes(x = time, y = migration_rate)) +
geom_ribbon(aes(ymin = lower, ymax = upper), fill = "grey70") +
geom_line()
print(g)
}
# Simulate from multiple rows of the estimation dataset
# Simulate trajectory blocks for several parameter-sample rows and stack
# them into one matrix.
#
# Args:
#   ids: vector of row indices into the estimation data frame `ar1est`
#        (consumed by ar1_block).
#   ...: forwarded to ar1_block() (rate0, traj_len, block_size).
# Returns: matrix with one block of rows per id stacked vertically, or NULL
#   when `ids` is empty (same as the original).
ar1_multblocks <- function(ids, ...) {
  # Build every block first and bind once: avoids the O(n^2) copying the
  # original incurred by rbind-ing inside the loop
  blocks <- lapply(ids, function(id) ar1_block(id, ...))
  do.call(rbind, blocks)
}
ar1est <- data.frame(mu=c(0.0105, 0.0185, 0.022, 0.0113, 0.0144, 0.0175, -9e-04, 0.0093, 0.0111, -9e-04, -0.0024, 0.0086, 0.012, 0.0161, 0.0043, 0.0175, 0.0118, 0.0019, 0.0116, 0.0048, 0.0154, 0.0137, 0.0168, 0.0191, 0.0108, -0.0037, 0.0135, 0.0203, -0.0042, 0.0097, 0.0209, 0.0034, 0.0113, 0.0102, 0.0094, -0.0012, 0.008, 0.0082, 0.0123, 0.0175, 0.0054, -0.0087, 0.0161, 0.0155, 0.0126, 0.0181, 0.014, -0.0135, -0.0095, 0.0142, 0.011, 0.0194, 0.0149, 0.0115, 0.0129, -0.0124, 0.0116, 0.0136, 0.0161, 0.005, 0.0165, -0.0079, 0.0129, -0.0016, -7e-04, 0.0243, 0.0193, -0.004, 0.0145, 0.0078, 0.0156, 0.001, 0.0032, 0.0069, 0.0146, 0.0164, 0.0113, 0.0116, 0.0182, 0.0167, -0.0031, 0.0168, 0.0137, 0.012, -0.0212, -0.0092, 0.019, 0.0167, -0.0021, 0.0156, 0.0173, 0.0148, -0.0036, 0.0168, 0.0179, 0.0086, 0.0131, 0.015, 0.0106, 0.0132, 0.0119, 0.0156, 0.0159, 0.0256, 0.0071, 0.0163, 0.0107, 0.0139, 0.0228, 0.0139, 0.0117, 0.0133, 0.0127, -0.0162, 0.0115, 0.0095, 0.0183, 0.0183, -6e-04, 0.0177, 0.0145, 0.0041, 0.0143, 0.0135, -0.0078, 0.0036, 0.015, 0.018, 0.0158, 0.0054, -0.0204, 0.0193, 0.0051, 0.0144, 0.0129, 0.0134, 0.0116, 0.0102, 0.0203, 0.0154, 0.0106, 0.0184, 0.0096, -0.0032, 0.0143, 0.0158, 0.0093, 0.0159, 0.0112, 0.0106, 0.0075, 0.0133, 0.0171, 0.0133, 0.0139, 0.0167, 0.0131, -0.0078, 0.0135, 0.0145, 0.0104, 8e-04, 0.0205, 0.0046, 0.011, 0.0148, 0.0202, 8e-04, 0.0211, 0.0135, -8e-04, -0.0104, -0.0027, 0.0094, 0.0179, -0.0101, 0.0156, 0.0155, 0.014, 0.0149, 0.0165, 0.0168, 0.0155, 0.0136, 0.0156, 0.0149, 0.0191, 0.0176, 0.0094, -0.0076, 0.0162, 0.0143, 0.0182, 0.0102, 0.015, -0.0292, 0.0063, -0.0028, 0.0163, 0.015),
sigma=c(0.0081, 0.0053, 0.0069, 0.0075, 0.0082, 0.006, 0.0101, 0.011, 0.0064, 0.0066, 0.0095, 0.0057, 0.0078, 0.005, 0.0076, 0.0064, 0.0067, 0.0049, 0.0086, 0.0067, 0.0063, 0.0054, 0.0063, 0.0077, 0.0072, 0.0074, 0.0067, 0.0047, 0.0125, 0.0069, 0.0052, 0.0073, 0.0063, 0.0072, 0.0086, 0.0079, 0.009, 0.006, 0.0077, 0.0061, 0.0082, 0.0072, 0.0054, 0.0056, 0.0072, 0.0085, 0.0064, 0.0058, 0.0064, 0.0084, 0.0075, 0.006, 0.0048, 0.0068, 0.0065, 0.0082, 0.0072, 0.0056, 0.0056, 0.0055, 0.0054, 0.0059, 0.0064, 0.0069, 0.0073, 0.0071, 0.0057, 0.0062, 0.0086, 0.0062, 0.0054, 0.0052, 0.0066, 0.0076, 0.0046, 0.0056, 0.0066, 0.0077, 0.0074, 0.0061, 0.0056, 0.0065, 0.0069, 0.0084, 0.0058, 0.007, 0.0074, 0.0077, 0.0081, 0.0083, 0.0054, 0.0057, 0.0076, 0.0119, 0.0056, 0.0078, 0.005, 0.0073, 0.0075, 0.0054, 0.0085, 0.011, 0.0063, 0.0056, 0.009, 0.0069, 0.008, 0.0063, 0.007, 0.0059, 0.0064, 0.006, 0.0103, 0.0085, 0.006, 0.0076, 0.0054, 0.0066, 0.0056, 0.0071, 0.0079, 0.007, 0.0085, 0.0075, 0.007, 0.0085, 0.006, 0.0067, 0.006, 0.0074, 0.0098, 0.0066, 0.0058, 0.0075, 0.0064, 0.0059, 0.0103, 0.0055, 0.0053, 0.0068, 0.0057, 0.009, 0.0118, 0.0096, 0.0085, 0.0075, 0.0078, 0.0041, 0.0056, 0.008, 0.0071, 0.006, 0.0046, 0.0061, 0.007, 0.0061, 0.0066, 0.0075, 0.0094, 0.0072, 0.008, 0.0064, 0.0079, 0.0068, 0.0069, 0.0058, 0.0056, 0.0057, 0.0065, 0.006, 0.0073, 0.0067, 0.0068, 0.0071, 0.0048, 0.0071, 0.0063, 0.0051, 0.0079, 0.0042, 0.0048, 0.0066, 0.0072, 0.0058, 0.0057, 0.0083, 0.0063, 0.0057, 0.0103, 0.0096, 0.0067, 0.0051, 0.0075, 0.0064, 0.0069, 0.007, 0.007, 0.0074, 0.0056, 0.006),
phi=c(0.42, 0.3509, 0.8197, 0.5304, 0.1491, 0.3675, 0.9687, 0.7877, 0.7114, 0.9435, 0.9634, 0.9189, 0.4758, 0.5738, 0.8016, 0.0509, 0.8281, 0.8168, 0.7442, 0.9347, 0.1699, 0.3566, 0.8388, 0.7724, 0.7474, 0.7834, 0.6661, 0.5162, 0.9025, 0.5306, 0.6912, 0.7625, 0.8289, 0.6985, 0.9188, 0.9639, 0.3178, 0.7288, 0.4129, 0.2196, 0.9304, 0.9697, 0.193, 0.1474, 0.3111, 0.8844, 0.7386, 0.9674, 0.9983, 0.4863, 0.9338, 0.7999, 0.4696, 0.5078, 0.5141, 0.9958, 0.6404, 0.2886, 0.4171, 0.9856, 0.3261, 0.9713, 0.682, 0.7686, 0.8577, 0.9481, 0.6057, 0.934, 0.3161, 0.9414, 0.8349, 0.8325, 0.8913, 0.7726, 0.7327, 0.1403, 0.8144, 0.7506, 0.225, 0.4884, 0.9052, 0.2891, 0.1652, 0.7612, 0.9403, 0.9865, 0.4107, 0.6518, 0.893, 0.4981, 0.72, 0.3366, 0.8437, 0.2551, 0.7753, 0.5, 0.7857, 0.7107, 0.5643, 0.2887, 0.9621, 0.2384, 0.414, 0.86, 0.6917, 0.4946, 0.2325, 0.3419, 0.9219, 0.2706, 0.717, 0.2327, 0.7541, 0.9692, 0.5838, 0.9346, 0.4739, 0.3219, 0.9634, 0.3046, 0.9913, 0.8485, 0.3071, 0.0373, 0.9183, 0.7935, 0.0039, 0.5968, 0.3654, 0.595, 0.9712, 0.2745, 0.6027, 0.7441, 0.7641, 0.3582, 0.3397, 0.7748, 0.8188, 0.0604, 0.5076, 0.2856, 0.6859, 0.6705, 0.0326, 0.8749, 0.2596, 0.1138, 0.6072, 0.4, 0.9241, 0.612, 0.2375, 0.2495, 0.0661, 0.3234, 0.7651, 0.8581, 0.4818, 0.7303, 0.7458, 0.8925, 0.2861, 0.982, 0.0791, 0.2474, 0.4326, 0.8757, 0.5288, 0.6476, 0.8473, 0.9098, 0.9562, 0.8464, 0.5444, 0.9738, 0.706, 0.0795, 0.391, 0.3167, 0.3311, 0.5681, 0.27, 0.9046, 0.2299, 0.2299, 0.085, 0.4002, 0.7443, 0.9865, 0.7028, 0.9016, 0.6092, 0.2367, 0.5402, 0.9401, 0.8013, 0.993, 0.2473, 0.6414)
)
str(ar1est)
## 'data.frame': 200 obs. of 3 variables:
## $ mu : num 0.0105 0.0185 0.022 0.0113 0.0144 0.0175 -0.0009 0.0093 0.0111 -0.0009 ...
## $ sigma: num 0.0081 0.0053 0.0069 0.0075 0.0082 0.006 0.0101 0.011 0.0064 0.0066 ...
## $ phi : num 0.42 0.351 0.82 0.53 0.149 ...
# Generate trajectories for all rows of the estimation dataset.
# Bug fix: the original used seq_along(nrow(ar1est)); nrow() returns a single
# number, so seq_along() over it is just 1 and only the first row was used.
# seq_len(nrow(ar1est)) yields the intended 1..200 row identifiers.
trajs <- ar1_multblocks(seq_len(nrow(ar1est)), rate0 = 0.015, block_size = 10, traj_len = 15)
# Show results
show_migration(trajs)
# Load package
library(parallel)
# How many physical cores are available?
ncores <- detectCores(logical = FALSE)
# Create a cluster
cl <- makeCluster(ncores)
# Process rnorm in parallel
clusterApply(cl, 1:ncores, fun = rnorm, mean = 10, sd = 2)
## [[1]]
## [1] 9.090645
##
## [[2]]
## [1] 8.414394 8.968384
# Evaluate partial sums in parallel
part_sums <- clusterApply(cl, x = c(1, 51), fun = function(x) sum(x:(x + 49)))
# Total sum
total <- sum(unlist(part_sums))
# Check for correctness
total == sum(1:100)
## [1] TRUE
# Stop the cluster
stopCluster(cl)
# Create a cluster and set parameters
cl <- makeCluster(2)
replicates <- 50
sample_size <- 10000
# Parallel evaluation
means <- clusterApply(cl, x = rep(sample_size, replicates), fun = myfunc)
# View results as histogram
hist(unlist(means))
Chapter 2 - The parallel package
Cluster basics:
Core of parallel:
Initialization of nodes:
Subsetting data:
Example code includes:
# Load parallel and create a cluster
library(parallel)
cl <- makeCluster(4)
# Investigate the cl object and its elements
typeof(cl)
## [1] "list"
length(cl)
## [1] 4
typeof(cl[[3]])
## [1] "list"
cl[[3]]$rank
## [1] 3
# What is the process ID of the workers
clusterCall(cl, Sys.getpid)
## [[1]]
## [1] 21008
##
## [[2]]
## [1] 31884
##
## [[3]]
## [1] 35200
##
## [[4]]
## [1] 35760
# Stop the cluster
stopCluster(cl)
# Define ncores and a print function
ncores <- 2
print_ncores <- function() print(ncores)
# Create a socket and a fork clusters
# cl_sock <- makeCluster(ncores, type = "PSOCK")
# cl_fork <- makeCluster(ncores, type = "FORK") # this is possible only on OS other than Windows
# Evaluate the print function on each cluster
# clusterCall(cl_sock, print_ncores) # this will fail since the socket has no knowledge of the main environment
# clusterCall(cl_fork, print_ncores)
# Change ncores and evaluate again
# ncores <- 4
# clusterCall(cl_fork, print_ncores) # the fork is only of the original environment, so these clusters will still think the answer is 2
# In this exercise, you will take the simple embarrassingly parallel application for computing mean of random numbers (myfunc()) from the first chapter, and implement two functions:
# One that runs the application sequentially, mean_seq(), and one that runs it in parallel, mean_par()
# Both functions have three arguments, n (sample size), repl (number of replicates) and ... (passed to myfunc())
# Function mean_par() assumes a cluster object cl to be present in the global environment
# Function to run repeatedly by the benchmarks: the mean of n draws from a
# normal distribution. Extra arguments (e.g. mean, sd) are forwarded to rnorm().
myfunc <- function(n, ...) {
  draws <- rnorm(n, ...)
  mean(draws)
}
# Sequential solution
# Runs myfunc() `repl` times with sample size `n`, returning the vector of
# replicate results. Extra arguments are forwarded to myfunc().
# Fixes the 1:repl footgun: seq_len(repl) is empty when repl = 0, whereas
# 1:0 would iterate over c(1, 0). Preallocates as numeric (NA_real_) so the
# return type is numeric even for repl = 0.
mean_seq <- function(n, repl, ...) {
  res <- rep(NA_real_, repl)
  for (it in seq_len(repl)) {
    res[it] <- myfunc(n, ...)
  }
  res
}
# Parallel solution
# Distributes `repl` replications of myfunc() (sample size `n`) across the
# cluster object `cl`, which is looked up in the calling/global environment.
# Extra arguments are forwarded to myfunc(). Returns a flat numeric vector.
mean_par <- function(n, repl, ...) {
  sizes <- rep(n, repl)
  chunks <- clusterApply(cl, x = sizes, fun = myfunc, ...)
  unlist(chunks)
}
# Load packages
library(parallel)
library(microbenchmark)
# Create a cluster
cl <- makeCluster(2)
# Compare run times
microbenchmark(mean_seq(3000000, repl = 4),
mean_par(3000000, repl = 4),
mean_seq(100, repl = 100),
mean_par(100, repl = 100),
times = 1, unit = "s")
## Unit: seconds
## expr min lq mean median uq
## mean_seq(3e+06, repl = 4) 2.1622809 2.1622809 2.1622809 2.1622809 2.1622809
## mean_par(3e+06, repl = 4) 1.3131428 1.3131428 1.3131428 1.3131428 1.3131428
## mean_seq(100, repl = 100) 0.0037029 0.0037029 0.0037029 0.0037029 0.0037029
## mean_par(100, repl = 100) 0.1783277 0.1783277 0.1783277 0.1783277 0.1783277
## max neval
## 2.1622809 1
## 1.3131428 1
## 0.0037029 1
## 0.1783277 1
# Stop cluster
stopCluster(cl)
# Load extraDistr on master
library(extraDistr)
##
## Attaching package: 'extraDistr'
## The following object is masked from 'package:purrr':
##
## rdunif
# Define myrdnorm: draw n values from the discrete normal distribution
# (rdnorm() from the extraDistr package), with optional mean/sd parameters.
myrdnorm <- function(n, mean = 0, sd = 1) {
  rdnorm(n, mean = mean, sd = sd)
}
# Run myrdnorm in parallel - should fail
# res <- clusterApply(cl, rep(1000, 20), myrdnorm, sd = 6) # will error out
# Load extraDistr on all workers
cl <- makeCluster(2)
clusterEvalQ(cl, library(extraDistr))
## [[1]]
## [1] "extraDistr" "stats" "graphics" "grDevices" "utils"
## [6] "datasets" "methods" "base"
##
## [[2]]
## [1] "extraDistr" "stats" "graphics" "grDevices" "utils"
## [6] "datasets" "methods" "base"
# Run myrdnorm in parallel again and show results
res <- clusterApply(cl, rep(1000, 20), myrdnorm, sd = 6)
hist(unlist(res))
# myrdnorm that uses global variables
# NOTE(review): `mean` and `sd` are intentionally unbound in this version --
# they are resolved at call time from the evaluation environment of each
# worker (set via clusterEvalQ()/clusterExport() below). This demonstrates
# worker initialization; it also shadows base::mean/stats::sd on the workers.
myrdnorm <- function(n) rdnorm(n, mean = mean, sd = sd)
# Initialize workers
clusterEvalQ(cl, {
library(extraDistr)
mean=10
sd=5
})
## [[1]]
## [1] 5
##
## [[2]]
## [1] 5
# Run myrdnorm in parallel and show results
res <- clusterApply(cl, rep(1000, 100), myrdnorm)
# View results
hist(unlist(res))
# Set global objects on master
mean <- 20
sd <- 10
# Export global objects to workers
clusterExport(cl, c("mean", "sd"))
# Load extraDistr on workers
clusterEvalQ(cl, library(extraDistr))
## [[1]]
## [1] "extraDistr" "stats" "graphics" "grDevices" "utils"
## [6] "datasets" "methods" "base"
##
## [[2]]
## [1] "extraDistr" "stats" "graphics" "grDevices" "utils"
## [6] "datasets" "methods" "base"
# Run myrdnorm in parallel and show results
res <- clusterApply(cl, rep(1000, 100), myrdnorm)
hist(unlist(res))
select_words <- function(letter, words, min_length = 1) {
  # Return the elements of `words` that are at least `min_length` characters
  # long and begin with `letter` (anchored regex keeps original semantics).
  long_enough <- words[nchar(words) >= min_length]
  pattern <- paste0("^", letter)
  grep(pattern, long_enough, value = TRUE)
}
# Export "select_words" to workers
clusterExport(cl, "select_words")
# Split indices for two chunks
ind <- splitIndices(length(words), 2)
# Find unique words in parallel
result <- clusterApply(cl, x = list(words[ind[[1]]], words[ind[[2]]]),
function(w, ...) unique(select_words("v", w, ...)),
min_length = 10)
# Show vectorized unique results
unique(unlist(result))
## [1] "voluntarily" "variations" "vindication" "violoncello"
## [5] "vouchsafed" "veneration" "volatility" "volubility"
## [9] "vigorously" "villainous" "vindicating" "vulnerable"
## [13] "vicissitudes" "vegetation" "vulgarisms" "valetudinarian"
## [17] "vindicated" "vouchsafing" "voluminous" "vehemently"
## [21] "valancourt" "venerating" "viscountess" "vanquished"
# Earlier you defined a function ar1_multblocks() that takes a vector of row identifiers as argument and generates migration trajectories using the corresponding rows of the parameter set ar1est
# ar1_multblocks() depends on ar1_block() which in turns depends on ar1_trajectory()
# These functions along with the cluster object cl of size 4, function show_migration(), the dataset ar1est (reduced to 200 rows) and packages parallel and ggplot2 are available in your workspace
ar1_block <- function(id, rate0 = 0.015, traj_len = 15, block_size = 10) {
  # Simulate `block_size` trajectories of length `traj_len`, all using the
  # AR(1) parameters stored in row `id` of the global `ar1est` data frame.
  # Returns a block_size x traj_len matrix, one trajectory per row.
  params <- unlist(ar1est[id, ])
  out <- matrix(NA, nrow = block_size, ncol = traj_len)
  for (row in seq_len(block_size)) {
    out[row, ] <- ar1_trajectory(params, rate0 = rate0, len = traj_len)
  }
  out
}
ar1_trajectory <- function(est, rate0, len = 15) {
  # Simulate `len` steps of an AR(1) process starting at rate0:
  #   r_t = mu + phi * (r_{t-1} - mu) + e_t,   e_t ~ N(0, sigma)
  # est: named vector with elements 'mu', 'phi' and 'sigma'.
  # Exactly one rnorm(1) draw per step, so results are reproducible under
  # a fixed seed.
  out <- rep(NA, len)
  prev <- rate0
  for (step in seq_len(len)) {
    shock <- rnorm(1, sd = est['sigma'])
    out[step] <- est['mu'] + est['phi'] * (prev - est['mu']) + shock
    prev <- out[step]
  }
  out
}
# Generate a trajectory block for each row identifier in `ids` and stack
# them row-wise into one matrix. Extra arguments are forwarded to ar1_block().
# Improvement: builds a list with lapply() and binds once via do.call(rbind, .)
# instead of growing the matrix with rbind() inside the loop, which copies
# the accumulated result on every iteration (O(n^2)). Returns NULL for
# empty `ids`, matching the original behavior.
ar1_multblocks <- function(ids, ...) {
  blocks <- lapply(ids, ar1_block, ...)
  do.call(rbind, blocks)
}
# Export data and functions
clusterExport(cl, c("ar1est", "ar1_block", "ar1_trajectory"))
# Process ar1_multblocks in parallel
res <- clusterApply(cl, 1:nrow(ar1est), ar1_multblocks)
# Combine results into a matrix and show results
trajs <- do.call(rbind, res)
show_migration(trajs)
# The object res returned by clusterApply() in the previous exercise is also in your workspace, now called res_prev
res_prev <- res
# Split task into 5 chunks
ind <- splitIndices(nrow(ar1est), 5)
# Process ar1_multblocks in parallel
res <- clusterApply(cl, ind, ar1_multblocks)
# Dimensions of results
(res_dim <- c(length(res), nrow(res[[1]])))
## [1] 5 400
(res_prev_dim <- c(length(res_prev), nrow(res_prev[[1]])))
## [1] 200 10
stopCluster(cl)
Chapter 3 - foreach, future.apply, and Load Balancing
foreach:
foreach and parallel backends:
future and future.apply - packages that are continually under development:
Load balancing and scheduling:
future_lapply(..., future.scheduling = 1) creates one chunk per worker, while future.scheduling = FALSE creates one chunk per task. Example code includes:
# Recall the first chapter where you found the most frequent words from the janeaustenr package that are of certain minimum length
result <- lapply(letters, max_frequency, words = words, min_length = 5) %>%
unlist
# In this exercise, you will implement the foreach construct to solve the same problem
# The janeaustenr package, a vector of all words from the included books, words, and a function max_frequency() for finding the results based on a given starting letter are all available in your workspace
# Load the package
library(foreach)
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
# foreach construct
result <- foreach(l = letters, .combine=c) %do% max_frequency(l, words=words, min_length=5)
# Plot results
barplot(result, las = 2)
# Specifically, your job is to modify the code so that the maximum frequency for the first half of the alphabet is obtained for words that are two and more characters long, while the frequency corresponding to the second half of the alphabet is derived from words that are six and more characters long
# Note that we are using an alphabet of 26 characters
# foreach construct and combine into vector
result <- foreach(l = letters, n = rep(c(2, 6), each=13), .combine = c) %do%
max_frequency(l, words=words, min_length=n)
# Plot results
barplot(result, las = 2)
# Register doParallel with 2 cores
doParallel::registerDoParallel(cores=2)
# Parallel foreach loop
res <- foreach(r = rep(1000, 100), .combine = rbind,
.packages = "extraDistr") %dopar% myrdnorm(r)
# Dimensions of res
dim_res <- dim(res)
# So far you learned how to search for the most frequent word in a text sequentially using foreach()
# In the course of the next two exercises, you will implement the same task using doParallel and doFuture for parallel processing and benchmark it against the sequential version
# The sequential solution is implemented in function freq_seq() (type freq_seq in your console to see it)
# It iterates over a global character vector chars and calls the function max_frequency() which searches within a vector of words, while filtering for minimum word length
# All these objects are preloaded, as is the doParallel package
# Your job now is to write a function freq_doPar() that runs the same code in parallel via doParallel
# Sequential foreach baseline: for each letter of the alphabet, find the
# most frequent word of at least `min_length` characters in the global
# `words` vector. %do% evaluates the loop sequentially on the master;
# .combine = c collapses the per-letter results into one vector.
freq_seq <- function(min_length = 5)
foreach(l = letters, .combine = c) %do%
max_frequency(l, words = words, min_length = min_length)
# Function for doParallel foreach
# Parallel counterpart of freq_seq(): registers a doParallel backend with
# `cores` workers, then computes the most frequent word per starting letter,
# filtered to words of at least `min_length` characters.
freq_doPar <- function(cores, min_length = 5) {
# Register a cluster of size cores
doParallel::registerDoParallel(cores=cores)
# foreach loop; .export ships the helper functions and the word vector to
# the workers, .packages loads the required packages on each worker
foreach(l=letters, .combine=c,
.export = c("max_frequency", "select_words", "words"),
.packages = c("janeaustenr", "stringr")) %dopar%
max_frequency(l, words=words, min_length=min_length)
}
# Run on 2 cores
freq_doPar(cores=2)
## again being could darcy every first great herself
## 1001 1445 3613 373 1456 972 981 1360
## indeed jennings knightley little might never other place
## 664 199 356 1295 1369 1362 1084 503
## quite really should there under visit would xviii
## 870 504 1541 2209 293 294 3238 4
## young zealous
## 766 5
# Now your job is to create a function freq_doFut() that accomplishes the same task as freq_doPar() but with the doFuture backend
# Note that when using doFuture, arguments .packages and .export in foreach() are not necessary, as the package deals with the exports automatically
# You will then benchmark these two functions, together with the sequential freq_seq()
# All the functions from the last exercise are available in your workspace
# In addition, the packages doFuture and microbenchmark are also preloaded
# To keep the computation time low, the global chars vector is set to the first six letters of the alphabet only
cores <- 2
min_length <- 5
# Error in tweak.function(strategy, ..., penvir = penvir) :
# Trying to use non-future function 'survival::cluster': function (x) { ... }
# For solution see https://github.com/HenrikBengtsson/future/issues/152
# Function for doFuture foreach
# Same task as freq_doPar(), but with the doFuture backend. With doFuture,
# globals and packages are detected and exported automatically, so no
# .export/.packages arguments are needed in foreach().
freq_doFut <- function(cores, min_length = 5) {
# Register and set plan: route %dopar% through future, using a cluster
# plan with `cores` local workers
doFuture::registerDoFuture()
future::plan(future::cluster, workers=cores)
# foreach loop
foreach(l = letters, .combine = c) %dopar%
max_frequency(l, words = words, min_length = min_length)
}
# Benchmark
microbenchmark(freq_seq(min_length),
freq_doPar(cores, min_length),
freq_doFut(cores, min_length),
times = 1)
## Unit: seconds
## expr min lq mean median uq
## freq_seq(min_length) 8.72952 8.72952 8.72952 8.72952 8.72952
## freq_doPar(cores, min_length) 11.43931 11.43931 11.43931 11.43931 11.43931
## freq_doFut(cores, min_length) 13.83650 13.83650 13.83650 13.83650 13.83650
## max neval
## 8.72952 1
## 11.43931 1
## 13.83650 1
# It is straight forward to swap parallel backends with foreach
# In this small example, you might not see any time advantage in running it in parallel
# In addition, doFuture is usually somewhat slower than doParallel
# This is because doFuture has a higher computation overhead
# We encourage you to test these frameworks on more time-consuming applications, where the overhead becomes negligible relative to the overall processing time
extract_words_from_text <- function(text) {
  # Tokenize `text` into words (stringr word boundaries), flatten the
  # per-element lists, and lower-case everything.
  tokens <- str_extract_all(text, boundary("word"))
  flat <- unlist(tokens)
  tolower(flat)
}
# Main function
# For each starting character in `chars`, find the most frequent word of at
# least `min_length` characters in `words`, parallelized over `chars` with
# future.apply under whatever future plan is currently set.
freq_fapply <- function(words, chars = letters, min_length = 5) {
  per_char <- future.apply::future_lapply(chars, FUN = max_frequency,
                                          words = words,
                                          min_length = min_length)
  unlist(per_char)
}
obama <- readLines("./RInputFiles/obama.txt")
obama_speech <- paste(obama[obama != ""], collapse=" ")
# Extract words and call freq_fapply
words <- extract_words_from_text(obama_speech)
res <- freq_fapply(words)
# Plot results
barplot(res, las = 2)
# Now imagine you are a user of the fictional package from the previous exercise
# At home you have a two-CPU Mac computer, and at work you use a Linux cluster with two 16-CPU computers, called "oisin" and "oscar"
# Your job is to write a function for each of the hardware that calls freq_fapply() while taking advantage of all available CPUs
# For the cluster, you set workers to a vector of computer names corresponding to the number of CPUs, i.e. 16 x "oisin" and 16 x "oscar"
# For a one-CPU environment, we have created a function fapply_seq()
# fapply_seq <- function(...) {
# future::plan(strategy="sequential")
# freq_fapply(words, letters, ...)
# }
# multicore function
# fapply_mc <- function(cores=2, ...) {
# plan(strategy="multicore", workers=cores)
# freq_fapply(words, letters, ...)
# }
# cluster function
# fapply_cl <- function(cores=NULL, ...) {
# # set default value for cores
# if(is.null(cores))
# cores <- rep(c("oisin", "oscar"), each = 16)
#
# # parallel processing
# plan(strategy="cluster", workers=cores)
# freq_fapply(words, letters, ...)
# }
# Note: Multicore does not work on Windows. We recommend using the 'multiprocess' or 'cluster' plan on Windows computers.
# Microbenchmark
# microbenchmark(fapply_seq = fapply_seq(),
# fapply_mc_2 = fapply_mc(cores=2),
# fapply_mc_10 = fapply_mc(cores=10),
# fapply_cl = fapply_cl(cores=2),
# times = 1)
# Which is the slowest?
# slowest1 <- "fapply_cl"
# This is because for a small number of tasks a sequential code can run faster than a parallel version due to the parallel overhead
# The cluster plan has usually the largest overhead and thus, should be used only for larger number of tasks
# The multicore may be more efficient when the number of workers is equal to the number of cores
# It uses shared memory, and thus is faster than cluster
# In your workspace there is a vector tasktime containing simulated processing times of 30 tasks (generated using runif())
# There is also a cluster object cl with two nodes
# Your job is to apply the function Sys.sleep() to tasktime in parallel using clusterApply() and clusterApplyLB() and benchmark them
# The parallel and microbenchmark packages are loaded
# We also provided functions for plotting cluster usage plots called plot_cluster_apply() and plot_cluster_applyLB()
# Both functions use functionality from the snow package
tasktime <- c(0.1328, 0.1861, 0.2865, 0.4541, 0.1009, 0.4492, 0.4723, 0.3304, 0.3146, 0.031, 0.1031, 0.0884, 0.3435, 0.1921, 0.3849, 0.2489, 0.3588, 0.496, 0.1901, 0.3887, 0.4674, 0.1062, 0.3259, 0.0629, 0.1337, 0.1931, 0.0068, 0.1913, 0.4349, 0.1702)
# plot_cluster_apply <- function(cl, x, fun)
# plot(snow::snow.time(snow::clusterApply(cl, x, fun)),
# title = "Cluster usage of clusterApply")
# plot_cluster_applyLB <- function(cl, x, fun)
# plot(snow::snow.time(snow::clusterApplyLB(cl, x, fun)),
# title = "Cluster usage of clusterApplyLB")
# Benchmark clusterApply and clusterApplyLB
# microbenchmark(
# clusterApply(cl, tasktime, Sys.sleep),
# clusterApplyLB(cl, tasktime, Sys.sleep),
# times = 1
# )
# Plot cluster usage
# plot_cluster_apply(cl, tasktime, Sys.sleep)
# plot_cluster_applyLB(cl, tasktime, Sys.sleep)
# Now we compare the results from the previous exercise with ones generated using parSapply(), which represents here an implementation that groups tasks into as many chunks as there are workers available
# You first explore its cluster usage plot, using the function plot_parSapply() we defined for you
# We generated a version of the tasktime vector, called bias_tasktime that generates very uneven load
# Your job is to compare the run times of parSapply() with clusterApplyLB() applied to bias_tasktime
# plot_parSapply <- function(cl, x, fun)
# plot(snow::snow.time(snow::parSapply(cl, x, fun)),
# title = "Cluster usage of parSapply")
# bias_tasktime <- c(1, 1, 1, 0.1, 0.1, 0.1, 1e-04, 1e-04, 1e-04, 0.001, 1)
# Plot cluster usage for parSapply
# plot_parSapply(cl, tasktime, Sys.sleep)
# Microbenchmark
# microbenchmark(
# clusterApplyLB(cl, bias_tasktime, Sys.sleep),
# parSapply(cl, bias_tasktime, Sys.sleep),
# times = 1
# )
# Plot cluster usage for parSapply and clusterApplyLB
# plot_cluster_applyLB(cl, bias_tasktime, Sys.sleep)
# plot_parSapply(cl, bias_tasktime, Sys.sleep)
Chapter 4 - Random Numbers and Reproducibility
Are results reproducible?
Example: set.seed(1234); clusterApply(cl, sample(1:10000000, 2), set.seed); print(clusterApply(cl, rep(3, 2), rnorm)). Parallel random number generators:
Reproducibility in foreach and future.apply:
Next steps:
Example code includes:
# In addition to the code in the previous exercise, we also created a FORK cluster for you.
# cl.fork <- makeCluster(2, type = "FORK")
# Your job is to register the two cluster objects with the preloaded doParallel package and compare results obtained with parallel foreach
# How do the results differ in terms of reproducibility?
library(doParallel)
## Loading required package: iterators
cl.sock <- makeCluster(2, type = "PSOCK")
registerDoParallel(cl.sock)
set.seed(100)
foreach (i = 1:2) %dopar% rnorm(3)
## [[1]]
## [1] -0.2671675 -0.1464838 1.1323965
##
## [[2]]
## [1] -0.5133412 -0.1970795 0.2736516
# Register and use cl.sock
registerDoParallel(cl.sock)
replicate(2, {
set.seed(100)
foreach(i = 1:2, .combine = rbind) %dopar% rnorm(3)
}, simplify = FALSE
)
## [[1]]
## [,1] [,2] [,3]
## result.1 -2.250581 0.03180824 0.267018
## result.2 -1.226201 0.91295147 -1.008805
##
## [[2]]
## [,1] [,2] [,3]
## result.1 0.2413098 1.3976016 0.08562643
## result.2 -1.7737211 -0.1632283 0.45858481
# Register and use cl.fork
# registerDoParallel(cl.fork)
# replicate(2, {
# set.seed(100)
# foreach(i = 1:2, .combine = rbind) %dopar% rnorm(3)
# }, simplify = FALSE
# )
# Create a cluster
cl <- makeCluster(2)
# Check RNGkind on workers
clusterCall(cl, RNGkind)
## [[1]]
## [1] "Mersenne-Twister" "Inversion" "Rejection"
##
## [[2]]
## [1] "Mersenne-Twister" "Inversion" "Rejection"
# Set the RNG seed on workers
clusterSetRNGStream(cl, iseed=100)
# Check RNGkind on workers
clusterCall(cl, RNGkind)
## [[1]]
## [1] "L'Ecuyer-CMRG" "Inversion" "Rounding"
##
## [[2]]
## [1] "L'Ecuyer-CMRG" "Inversion" "Rounding"
# Now you are ready to make your results reproducible
# You will use the simple embarrassingly parallel application for computing a mean of random numbers (myfunc) which we parallelized in the second chapter using clusterApply()
# The parallel package, myfunc() , n (sample size, set to 1000) and repl (number of replicates, set to 5) are available in your workspace
# You will now call clusterApply() repeatedly to check if results can be reproduced, without and with initializing the RNG
n <- 1000
repl <- 5
# Create a cluster of size 2
cl <- makeCluster(2)
# Call clusterApply three times
for(i in 1:3)
print(unlist(clusterApply(cl, rep(n, repl), myfunc)))
## [1] -0.04346884 0.01101790 -0.01044725 0.02097522 -0.02486239
## [1] -7.848246e-02 2.333154e-02 -1.722386e-02 1.961149e-02 8.339214e-05
## [1] -0.010402906 0.016466128 -0.033822199 -0.001632323 -0.002724153
# Create a seed object
seed <- 1234
# Repeatedly set the cluster seed and call clusterApply()
for(i in 1:3) {
clusterSetRNGStream(cl, iseed = seed)
print(unlist(clusterApply(cl, rep(n, repl), myfunc)))
}
## [1] -0.008597904 -0.006089337 -0.013980519 -0.066293388 0.004297755
## [1] -0.008597904 -0.006089337 -0.013980519 -0.066293388 0.004297755
## [1] -0.008597904 -0.006089337 -0.013980519 -0.066293388 0.004297755
# Create two cluster objects, of size 2 and 4
cl2 <- makeCluster(2)
cl4 <- makeCluster(4)
# Set seed on cl2 and call clusterApply
clusterSetRNGStream(cl2, iseed = seed)
unlist(clusterApply(cl2, rep(n, repl), myfunc))
## [1] -0.008597904 -0.006089337 -0.013980519 -0.066293388 0.004297755
# Set seed on cl4 and call clusterApply
clusterSetRNGStream(cl4, iseed = seed)
unlist(clusterApply(cl4, rep(n, repl), myfunc))
## [1] -0.008597904 -0.006089337 0.077876985 -0.072012937 -0.013980519
# Register doParallel and doRNG
library(doRNG)
## Loading required package: rngtools
## Loading required package: pkgmaker
## Loading required package: registry
##
## Attaching package: 'pkgmaker'
## The following object is masked from 'package:base':
##
## isFALSE
registerDoParallel(cores = 2)
doRNG::registerDoRNG(seed)
# Call ar1_block via foreach
mpar <- foreach(r=1:5) %dopar% ar1_block(r)
# Register sequential backend, set seed and run foreach
registerDoSEQ()
set.seed(seed)
mseq <- foreach(r=1:5) %dorng% ar1_block(r)
# Check if results identical
identical(mpar, mseq)
## [1] TRUE
# You are able to reproduce sequential and parallel applications! Remember to always use %dorng% if you use the doSEQ backend
# Also note that by default on the Linux DataCamp server, registerDoParallel() creates a FORK cluster if a number of cores is passed to it
# As a result, there was no need to export any functions to workers, as they were copied from the master
# On a different platform, the .export option may be needed
# Set multiprocess plan
future::plan(strategy="multiprocess", workers = 2)
# Call ar1_block via future_lapply
mfpar <- future.apply::future_lapply(1:5, FUN=ar1_block, future.seed=seed)
# Set sequential plan and repeat future_lapply
future::plan(strategy="sequential")
mfseq <- future.apply::future_lapply(1:5, FUN=ar1_block, future.seed=seed)
# Check if results are identical
identical(mfpar, mfseq)
## [1] TRUE
rm(mean)
rm(sd)
Chapter 1 - Quickstart Guide
Why choice?
Inspecting choice data:
Fitting and interpreting a choice model:
Using choice models to make decisions:
Example code includes:
# Unload conflicting namespaces
unloadNamespace("rms")
unloadNamespace("quantreg")
unloadNamespace("MatrixModels")
unloadNamespace("lmerTest")
unloadNamespace("semPlot")
unloadNamespace("rockchalk")
unloadNamespace("qgraph")
unloadNamespace("sem")
unloadNamespace("mi")
unloadNamespace("arm")
unloadNamespace("mice")
unloadNamespace("mitml")
unloadNamespace("jomo")
unloadNamespace("arm")
unloadNamespace("jomo")
unloadNamespace("lme4")
# load the mlogit library
library(mlogit)
scLong <- read.csv("./RInputFiles/sportscar_choice_long.csv")
scWide <- read.csv("./RInputFiles/sportscar_choice_wide.csv")
sportscar <- scLong
sportscar$alt <- as.factor(sportscar$alt)
sportscar$seat <- as.factor(sportscar$seat)
sportscar$price <- as.factor(sportscar$price)
sportscar$choice <- as.logical(sportscar$choice)
sportscar <- sportscar %>% rename(resp.id=resp_id)
sportscar$key <- rep(1:2000, each=3)
row.names(sportscar) <- paste(sportscar$key, sportscar$alt, sep=".")
sportscar <- mlogit.data(sportscar, shape="long", choice="choice", alt.var="alt")
str(sportscar)
# Create a table of chosen sportscars by transmission type
chosen_by_trans <- xtabs(choice ~ trans, data = sportscar)
# Print the chosen_by_trans table to the console
chosen_by_trans
# Plot the chosen_by_price object
barplot(chosen_by_trans)
# Crashes out due to issue with class "family" in MatrixModels and lme4
m1 <- mlogit(choice ~ seat + trans + convert + price, data=sportscar, seed=10)
# fit a choice model using mlogit() and assign the output to m1
# m1 <- mlogit::mlogit(choice ~ seat + trans + convert + price,
# data=sportscar,
# chid.var="key",
# alt.var="alt",
# choice="choice",
# seed=10
# )
# summarize the m1 object to see the output of the choice model
summary(m1)
predict_mnl <- function(model, products) {
# model: mlogit object returned by mlogit()
# products: a data frame containing the set of designs for which you want
# to predict shares. Same format as the data used to estimate model.
# Returns `products` with a predicted-share column prepended.
# Build the design matrix from the model formula; 0 ~ . drops the response
# and [,-1] removes the intercept column
data.model <- model.matrix(update(model$formula, 0 ~ .), data = products)[,-1]
# Multinomial logit shares: softmax of the linear utilities
utility <- data.model%*%model$coef
share <- exp(utility)/sum(exp(utility))
cbind(share, products)
}
# inspect products
products <- data.frame(seat=factor("2", levels=c("2", "4", "5")),
trans=factor(rep(c("manual", "auto"), each=2), levels=c("auto", "manual")),
convert=factor(rep(c("no", "yes"), times=2), levels=c("no", "yes")),
price=factor("35", levels=c("30", "35", "40"))
)
str(products)
# use predict_mnl to predict share for products
shares <- predict_mnl(m1, products)
# print the shares to the console
shares
barplot(shares$share, ylab="Predicted Market Share",
names.arg=c("Our Car", "Comp 1", "Comp 2", "Comp 3"))
Chapter 2 - Managing and Summarizing Choice Data
Assembling choice data:
Converting from wide to long:
Choice data in two files:
Visualizing choice data:
Designing a conjoint survey:
Example: rand_rows <- sample(1:nrow(all_comb), size=12*3); rand_alts <- all_comb[rand_rows, ]; choc_survey[choc_survey$Subject==i, 4:6] <- rand_alts. Example code includes:
chLong <- read.csv("./RInputFiles/chocolate_choice_long.csv")
chWide <- read.csv("./RInputFiles/chocolate_choice_wide.csv")
chocolate_wide <- chWide
# Look at the head() of chocolate_wide
head(chocolate_wide)
## Subject Trial Brand1 Brand2 Brand3 Price1 Price2 Price3 Type1
## 1 2401 1 Dove Godiva Dove 0.6 0.7 3.6 Milk
## 2 2401 2 Godiva Godiva Hershey's 2.7 3.9 0.7 Milk w/ Nuts
## 3 2401 3 Hershey's Godiva Hershey's 1.7 3.7 3.0 Dark w/ Nuts
## 4 2401 4 Lindt Lindt Ghirardelli 1.0 3.6 0.5 Milk
## 5 2401 5 Hershey's Godiva Hershey's 0.8 1.5 3.3 Milk w/ Nuts
## 6 2401 6 Lindt Dove Godiva 3.1 2.5 2.6 Milk
## Type2 Type3 Selection Response_Time
## 1 Dark White 1 5210
## 2 Dark Milk w/ Nuts 2 7480
## 3 Dark Dark 2 7704
## 4 Milk w/ Nuts Dark w/ Nuts 1 5774
## 5 Dark White 2 5238
## 6 White Dark 3 3423
# Use summary() to see which brands and types are in chocolate_wide
summary(chocolate_wide)
## Subject Trial Brand1 Brand2 Brand3
## Min. :2401 Min. : 1 Dove :60 Dove :85 Dove :69
## 1st Qu.:2405 1st Qu.: 7 Ghirardelli:58 Ghirardelli:67 Ghirardelli:61
## Median :2410 Median :13 Godiva :83 Godiva :74 Godiva :78
## Mean :2409 Mean :13 Hershey's :63 Hershey's :66 Hershey's :78
## 3rd Qu.:2413 3rd Qu.:19 Lindt :86 Lindt :58 Lindt :64
## Max. :2417 Max. :25
## Price1 Price2 Price3 Type1
## Min. :0.500 Min. :0.500 Min. :0.500 Dark :63
## 1st Qu.:1.100 1st Qu.:1.300 1st Qu.:1.300 Dark w/ Nuts:70
## Median :2.200 Median :2.400 Median :2.200 Milk :75
## Mean :2.144 Mean :2.255 Mean :2.233 Milk w/ Nuts:83
## 3rd Qu.:3.100 3rd Qu.:3.200 3rd Qu.:3.100 White :59
## Max. :4.000 Max. :4.000 Max. :4.000
## Type2 Type3 Selection Response_Time
## Dark :95 Dark :75 Min. :1.000 Min. : 1021
## Dark w/ Nuts:68 Dark w/ Nuts:75 1st Qu.:1.000 1st Qu.: 2750
## Milk :55 Milk :60 Median :2.000 Median : 3878
## Milk w/ Nuts:55 Milk w/ Nuts:67 Mean :1.926 Mean : 4713
## White :77 White :73 3rd Qu.:3.000 3rd Qu.: 5766
## Max. :3.000 Max. :24462
# ---- Chocolate conjoint data preparation ----
# Reshape the wide survey data (Brand1..3 / Price1..3 / Type1..3 per trial) into
# long format with one row per (trial, alternative), then recode Selection to a
# per-alternative logical indicator.
# use reshape() to change the data from wide to long
chocolate <- reshape(data= chocolate_wide , direction="long",
varying = list(Brand=3:5, Price=6:8, Type=9:11),
v.names=c("Brand", "Price", "Type"), timevar="Alt")
# use head() to confirm that the data has been properly transformed
head(chocolate)
## Subject Trial Selection Response_Time Alt Brand Price Type id
## 1.1 2401 1 1 5210 1 Dove 0.6 Milk 1
## 2.1 2401 2 2 7480 1 Godiva 2.7 Milk w/ Nuts 2
## 3.1 2401 3 2 7704 1 Hershey's 1.7 Dark w/ Nuts 3
## 4.1 2401 4 1 5774 1 Lindt 1.0 Milk 4
## 5.1 2401 5 2 5238 1 Hershey's 0.8 Milk w/ Nuts 5
## 6.1 2401 6 3 3423 1 Lindt 3.1 Milk 6
# Create the new order for the chocolate data frame
# (sort so the three alternatives of each trial are adjacent)
new_order <- order(chocolate$Subject, chocolate$Trial, chocolate$Alt)
# Reorder the chocolate data frame to the new_order
chocolate <- chocolate[new_order,]
# Look at the head() of chocolate to see how it has been reordered
head(chocolate)
## Subject Trial Selection Response_Time Alt Brand Price Type id
## 1.1 2401 1 1 5210 1 Dove 0.6 Milk 1
## 1.2 2401 1 1 5210 2 Godiva 0.7 Dark 1
## 1.3 2401 1 1 5210 3 Dove 3.6 White 1
## 2.1 2401 2 2 7480 1 Godiva 2.7 Milk w/ Nuts 2
## 2.2 2401 2 2 7480 2 Godiva 3.9 Dark 2
## 2.3 2401 2 2 7480 3 Hershey's 0.7 Milk w/ Nuts 2
# Use head(chocolate) and look at the Selection variable.
head(chocolate)
## Subject Trial Selection Response_Time Alt Brand Price Type id
## 1.1 2401 1 1 5210 1 Dove 0.6 Milk 1
## 1.2 2401 1 1 5210 2 Godiva 0.7 Dark 1
## 1.3 2401 1 1 5210 3 Dove 3.6 White 1
## 2.1 2401 2 2 7480 1 Godiva 2.7 Milk w/ Nuts 2
## 2.2 2401 2 2 7480 2 Godiva 3.9 Dark 2
## 2.3 2401 2 2 7480 3 Hershey's 0.7 Milk w/ Nuts 2
# Transform the Selection variable to a logical indicator
# (TRUE on the row of the alternative that was actually chosen in that trial)
chocolate$Selection <- chocolate$Alt == chocolate$Selection
# Use head(chocolate) to see how the Selection variable has changed. Now it is logical.
head(chocolate)
## Subject Trial Selection Response_Time Alt Brand Price Type id
## 1.1 2401 1 TRUE 5210 1 Dove 0.6 Milk 1
## 1.2 2401 1 FALSE 5210 2 Godiva 0.7 Dark 1
## 1.3 2401 1 FALSE 5210 3 Dove 3.6 White 1
## 2.1 2401 2 FALSE 7480 1 Godiva 2.7 Milk w/ Nuts 2
## 2.2 2401 2 TRUE 7480 2 Godiva 3.9 Dark 2
## 2.3 2401 2 FALSE 7480 3 Hershey's 0.7 Milk w/ Nuts 2
# Split the long data into a choices table (one row per trial) and an
# alternatives table (three rows per trial), then re-merge them.
choc_choice <- chocolate %>%
filter(Selection==TRUE) %>%
mutate(Selection=Alt) %>%
select(Subject, Trial, Response_Time, Selection)
choc_alts <- chocolate %>%
select(Subject, Trial, Alt, Brand, Price, Type)
str(choc_choice)
## 'data.frame': 350 obs. of 4 variables:
## $ Subject : int 2401 2401 2401 2401 2401 2401 2401 2401 2401 2401 ...
## $ Trial : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Response_Time: int 5210 7480 7704 5774 5238 3423 4691 3268 6719 3542 ...
## $ Selection : int 1 2 2 1 2 3 3 3 2 2 ...
str(choc_alts)
## 'data.frame': 1050 obs. of 6 variables:
## $ Subject: int 2401 2401 2401 2401 2401 2401 2401 2401 2401 2401 ...
## $ Trial : int 1 1 1 2 2 2 3 3 3 4 ...
## $ Alt : int 1 2 3 1 2 3 1 2 3 1 ...
## $ Brand : Factor w/ 5 levels "Dove","Ghirardelli",..: 1 3 1 3 3 4 4 3 4 5 ...
## $ Price : num 0.6 0.7 3.6 2.7 3.9 0.7 1.7 3.7 3 1 ...
## $ Type : Factor w/ 5 levels "Dark","Dark w/ Nuts",..: 3 1 5 4 1 4 2 1 1 3 ...
## - attr(*, "reshapeLong")=List of 4
## ..$ varying:List of 3
## .. ..$ Brand: chr "Brand1" "Brand2" "Brand3"
## .. ..$ Price: chr "Price1" "Price2" "Price3"
## .. ..$ Type : chr "Type1" "Type2" "Type3"
## ..$ v.names: chr "Brand" "Price" "Type"
## ..$ idvar : chr "id"
## ..$ timevar: chr "Alt"
# Merge choc_choice and choc_alts
choc_merge <- merge(choc_choice, choc_alts, by=c("Subject", "Trial"))
# Convert Selection to a logical variable
choc_merge$Selection <- choc_merge$Selection == choc_merge$Alt
# Inspect choc_merge using head
head(choc_merge)
## Subject Trial Response_Time Selection Alt Brand Price Type
## 1 2401 1 5210 TRUE 1 Dove 0.6 Milk
## 2 2401 1 5210 FALSE 2 Godiva 0.7 Dark
## 3 2401 1 5210 FALSE 3 Dove 3.6 White
## 4 2401 10 3542 FALSE 1 Lindt 0.6 Milk w/ Nuts
## 5 2401 10 3542 TRUE 2 Godiva 0.8 Milk w/ Nuts
## 6 2401 10 3542 FALSE 3 Hershey's 3.7 Dark
# Use xtabs to count up how often each Type is chosen
counts <- xtabs(~ Type + Selection, data=chocolate)
# Plot the counts
plot(counts, cex = 1.5)
# Modify this code to count up how many times each **Brand** is chosen
counts <- xtabs(~ Brand + Selection, data=chocolate)
# Plot the counts
plot(counts, cex = 1.5)
# Use xtabs to count up how often each Price is chosen
counts <- xtabs(~ Price + Selection, data=chocolate)
# Plot the counts
plot(counts, cex=0.6)
Chapter 3 - Building Choice Models
Choice models - under the hood:
Interpreting choice model parameters:
Intercepts and interactions:
Predicting shares:
Key share-prediction snippet: `data.model <- model.matrix(update(model$formula, 0 ~ .), data = products)[,-1]`; `utility <- data.model %*% model$coef`; `share <- exp(utility)/sum(exp(utility))`; `cbind(share, products)`. Example code includes:
# ---- Fit multinomial logit (MNL) choice models with mlogit ----
# use mlogit.data() to convert chocolate to mlogit.data
chocolate_df <- mlogit.data(chocolate, shape = "long",
choice = "Selection", alt.var = "Alt",
varying = 6:8)
# use str() to confirm that chocolate is an mlogit.data object
str(chocolate_df)
# Fit a model with mlogit() and assign it to choc_m1
choc_m1 <- mlogit(Selection ~ Brand + Type + Price, data=chocolate_df, print.level=3)
# Summarize choc_m1 with summary()
summary(choc_m1)
# modify the call to mlogit to exclude the intercept (0 + ... drops
# alternative-specific intercepts, which are meaningless for unlabeled alternatives)
choc_m2 <- mlogit(Selection ~ 0 + Brand + Type + Price, data = chocolate_df, print.level=3)
# summarize the choc_m2 model
summary(choc_m2)
# compute the wtp (willingness to pay) by dividing the coefficient vector
# by the negative of the price coefficient
coef(choc_m2) / -coef(choc_m2)["Price"]
# change the Price variable to a factor in the chocolate data
# (treats each price point as its own level instead of a linear effect)
chocolate$Price <- as.factor(chocolate$Price)
# fit a model with mlogit and assign it to choc_m3
choc_m3 <- mlogit(Selection ~ 0 + Brand + Type + Price, data=chocolate)
# inspect the coefficients
summary(choc_m3)
# likelihood ratio test comparing two models (linear vs. factor Price)
lrtest(choc_m2, choc_m3)
# add the formula for mlogit: Brand-by-Type interaction
choc_m4 <- mlogit(Selection ~ 0 + Brand + Type + Price + Brand:Type, data=chocolate)
# use summary to see the coefficients
summary(choc_m4)
# add the formula for mlogit: Price-by-Trial interaction
choc_m5 <- mlogit(Selection ~ 0 + Brand + Type + Price + Price:Trial, data=chocolate)
# use summary to see the outputs
summary(choc_m5)
# NOTE(review): the next two statements duplicate the choc_m5 fit above,
# as transcribed from the course; left in place to match the original notes.
# add the formula for mlogit
choc_m5 <- mlogit(Selection ~ 0 + Brand + Type + Price + Price:Trial, data=chocolate)
# use summary to see the outputs
summary(choc_m5)
# Predict multinomial-logit choice shares for a set of product profiles.
#
# Args:
#   model:    a fitted choice model exposing `$formula` and `$coef`
#             (e.g. an mlogit fit).
#   products: data frame of competing product profiles, one row each; its
#             columns must cover the variables on the formula's right side.
#
# Returns: `products` with a leading `share` column; shares are the logit
# probabilities exp(u)/sum(exp(u)) and sum to 1 over the supplied rows.
predict_mnl <- function(model, products) {
  # Build the design matrix from the model's RHS terms (response replaced
  # by 0), then drop the intercept column to match the coefficient vector.
  design <- model.matrix(update(model$formula, 0 ~ .), data = products)[, -1]
  # Deterministic utility of each profile under the fitted coefficients.
  eu <- exp(design %*% model$coef)
  # Logit share formula over the competing set.
  share <- eu / sum(eu)
  cbind(share, products)
}
# ---- Predict shares for two bicycle profiles in the "racer" segment ----
# modify the code below so that the segment is set to "racer" for both alternatives
price <- c(35, 30)
seat <- factor(c(2, 2), levels=c(2,4,5))
trans <- factor(c("manual", "auto"), levels=c("auto", "manual"))
convert <- factor(c("no", "no"), levels=c("no", "yes"))
segment <- factor(c("racer", "racer"), levels=c("basic", "fun", "racer"))
prod <- data.frame(seat, trans, convert, price, segment)
# predict shares for the "racer" segment
# NOTE(review): m5 is fit elsewhere in the course notes (not in this chunk) —
# presumably an mlogit model on the bicycle data; confirm before running standalone.
predict_mnl(model=m5, products=prod)
# ---- Predict and plot shares for five Ghirardelli bars at a $3 price ----
# fit the choc_m2 model
choc_m2 <- mlogit(Selection ~ 0 + Brand + Type + Price, data=chocolate)
# create a data frame with the Ghirardelli products
# NOTE(review): `level =` relies on partial matching of factor()'s `levels`
# argument; it works but `levels =` would be clearer.
Brand <- factor(rep("Ghirardelli", 5), level = levels(chocolate$Brand))
Type <- levels(chocolate$Type)
Price <- 3 # treated as a number in choc_m2
ghir_choc <- data.frame(Brand, Type, Price)
# predict shares
predict_mnl(model=choc_m2, products=ghir_choc)
# compute and save the share prediction
shares <- predict_mnl(choc_m2, ghir_choc)
# make a barplot of the shares
# NOTE(review): "Ghiradelli" in the plot title is a typo for "Ghirardelli";
# string left as-is to match the original transcript.
barplot(shares$share,
horiz = TRUE, col="tomato2",
xlab = "Predicted Market Share",
main = "Shares for Ghiradelli chocolate bars at $3 each",
names.arg = levels(chocolate$Type)
)
Chapter 4 - Hierarchical Choice Models
What is a hierarchical choice model?
Key hierarchical-model snippet: each respondent's coefficients are drawn via `beta[i] <- mvrnorm(1, beta_0, Sigma)` (a random multivariate-normal vector); then for each task `j in 1:n_task[i]`, the task's design rows `X` give utilities `u <- X %*% beta[i]` and choice probabilities `p[i,] <- exp(u) / sum(exp(u))`. Heterogeneity in preferences for other features:
Predicting shares with hierarchical models:
Key share-simulation snippet: draw `coef <- mvrnorm(1, mu=mean, Sigma=Sigma)`, compute `utility <- prod.coded %*% coef`, and accumulate `share[i,] <- exp(utility) / sum(exp(utility))`. Wrap up:
Example code includes:
# ---- Hierarchical (mixed) MNL: random coefficients across subjects ----
# Determine the number of subjects in chocolate$Subject
length(levels(chocolate$Subject))
# add id.var input to mlogit.data call (id.var identifies the panel/subject)
chocolate <- mlogit.data(chocolate, choice = "Selection", shape="long",
varying=6:8, alt.var = "Alt", id.var = "Subject"
)
# add rpar and panel inputs to mlogit call
# (rpar = c(Price="n") makes the Price coefficient normally distributed
#  across subjects; panel=TRUE ties repeated choices to the same subject)
choc_m6 <- mlogit(Selection ~ 0 + Brand + Type + Price, data = chocolate,
rpar = c(Price="n"), panel=TRUE)
# plot the model
plot(choc_m6)
# set the contrasts for Brand to effects code (sum-to-zero coding)
contrasts(chocolate$Brand) <- contr.sum(levels(chocolate$Brand))
dimnames(contrasts(chocolate$Brand))[[2]] <- levels(chocolate$Brand)[1:4]
contrasts(chocolate$Brand)
# set the contrasts for Type to effects code
contrasts(chocolate$Type) <- contr.sum(levels(chocolate$Type))
dimnames(contrasts(chocolate$Type))[[2]] <- levels(chocolate$Type)[1:4]
contrasts(chocolate$Type)
# create my_rpar vector: every coefficient gets a normal random distribution
choc_m2 <- mlogit(Selection ~ 0 + Brand + Type + Price, data=chocolate)
my_rpar <- rep("n", length(choc_m2$coef))
names(my_rpar) <- names(choc_m2$coef)
my_rpar
# fit model with random coefficients
choc_m7 <- mlogit(Selection ~ 0 + Brand + Type + Price, data=chocolate, rpar=my_rpar, panel=TRUE)
# print the coefficients
choc_m7$coef[5:8]
# compute the negative sum of those coefficients
# (with effects coding, the omitted level's effect is minus the sum of the others)
-sum(choc_m7$coef[5:8])
# Extract the mean parameters and assign to mean
# NOTE(review): choc_m8 is fit outside this chunk; also, `mean` shadows
# base::mean here — harmless in this script but worth renaming.
mean <- choc_m8$coef[1:9]
# Extract the covariance parameters and assign to Sigma
Sigma <- cov.mlogit(choc_m8)
# Create storage for individual draws of share
share <- matrix(NA, nrow=1000, ncol=nrow(choc_line_coded))
# For each draw (person)
for (i in 1:1000) {
# Draw a coefficient vector
coef <- mvrnorm(1, mu=mean, Sigma=Sigma)
# Compute utilities for those coef
utility <- choc_line_coded %*% coef
# Compute probabilities according to the logit formula
share[i,] <- exp(utility) / sum(exp(utility))
}
# Average the draws of the shares
cbind(colMeans(share), choc_line)
Chapter 1 - What Is RNA Single-Cell RNA-Seq?
Background and utility:
Typical workflow:
Load, create, and access data:
Example code includes:
# ---- scRNA-seq basics: count matrix exploration and SingleCellExperiment ----
# counts is a genes x cells matrix (rows = genes, columns = cells)
# head of count matrix
counts[1:3, 1:3]
# count of specific gene and cell
alignedReads <- counts['Cnr1', "SRR2140055"]
# overall percentage of zero counts (scRNA-seq matrices are very sparse)
pZero <- mean(counts == 0)
# cell library size (total reads per cell = column sums)
libsize <- colSums(counts)
# find cell coverage (fraction of genes detected per cell)
coverage <- colMeans(counts > 0)
cell_info$coverage <- coverage
# load ggplot2
library(ggplot2)
# plot cell coverage
ggplot(cell_info, aes(x = names, y = coverage)) +
geom_point() +
ggtitle('Cell Coverage') +
xlab('Cell Name') +
ylab('Coverage')
# mean of GC content
gc_mean <- mean(gene_info$gc)
# standard deviation of GC content
gc_sd <- sd(gene_info$gc)
# boxplot of GC content
boxplot(gene_info$gc, main = 'Boxplot - GC content', ylab = 'GC content')
# batch
batch <- cell_info$batch
# patient
patient <- cell_info$patient
# nesting of batch within patient (cross-tabulate to check the design)
batch_patient <- table(batch = batch, patient = patient)
# explore batch_patient
batch_patient
# load SingleCellExperiment
library(SingleCellExperiment)
# create a SingleCellExperiment object
sce <- SingleCellExperiment(assays = list(counts = counts ),
rowData = data.frame(gene_names = rownames(counts)),
colData = data.frame(cell_names = colnames(counts)))
# create a SummarizedExperiment object from counts
se <- SummarizedExperiment(assays = list(counts = counts))
# create a SingleCellExperiment object from se by coercion
sce <- as(se, "SingleCellExperiment")
# create SingleCellExperiment object
sce <- as(allen, "SingleCellExperiment")
# cell information
cell_info <- colData(sce)
# size factors (here simply each cell's library size)
sizeFactors(sce) <- colSums(assay(sce))
Chapter 2 - Quality Control and Normalization
Quality Control:
Quality Control (continued):
Normalization:
Example code includes:
# ---- scRNA-seq quality control and normalization ----
# remove genes with only zeros (never detected in any cell)
nonZero <- counts(sce) > 0
keep <- rowSums(nonZero) > 0
sce_2 <- sce[keep, ]
# spike-ins ERCC (external control transcripts, identified by name prefix)
isSpike(sce_2, "ERCC") <- grepl("^ERCC-", rownames(sce_2))
# load scater
library(scater)
# calculate QCs
sce <- calculateQCMetrics(sce, feature_controls = list(ERCC = isSpike(sce, "ERCC")))
# explore coldata of sce
colData(sce)
# set threshold (minimum total counts per cell)
threshold <- 20000
# plot density
plot(density(sce@colData$total_counts), main = 'Density - total_counts')
abline(v = threshold)
# keep cells
keep <- sce@colData$total_counts > threshold
# tabulate kept cells
table(keep)
# set threshold (minimum number of detected features per cell)
threshold <- 6000
# plot density
plot(density(sce$total_features), main = 'Density - total_features')
abline(v=threshold)
# keep cells
keep <- sce$total_features > threshold
# tabulate kept cells
table(keep)
# extract cell data into a data frame
cDataFrame <- as.data.frame(colData(sce))
# plot cell data (spike-in counts vs. total counts, colored by batch)
ggplot(cDataFrame, aes(x = total_counts, y = total_counts_ERCC, col = batch)) +
geom_point()
# keep cells: drop the problematic batch
keep <- sce$batch != "NA19098.r2"
# tabulate kept cells
table(keep)
# load SingleCellExperiment
library(SingleCellExperiment)
# filter genes: keep genes with a count above 1 in more than one cell
filter_genes <- apply(counts(sce), 1, function(x){
length(x[x > 1]) > 1
})
# tabulate the results of filter_genes
table(filter_genes)
# PCA raw counts
plotPCA(sce, exprs_values = "counts",
colour_by = "batch", shape_by = "individual")
# PCA log counts
plotPCA(sce, exprs_values = "logcounts_raw",
colour_by = "batch", shape_by = "individual")
# find first 2 PCs
pca <- reducedDim(sce, "PCA")[, 1:2]
# create cdata
cdata <- data.frame(PC1 = pca[, 1],
libsize = sce$total_counts,
batch = sce$batch)
# plot pc1 versus libsize (checks whether PC1 merely captures library size)
ggplot(cdata, aes(x = PC1, y = libsize, col = batch)) +
geom_point()
# load scran
library(scran)
# find size factors (pooled/deconvolution size factors)
sce <- computeSumFactors(sce)
# display size factor histogram
hist(sizeFactors(sce))
# view assays
assays(sce)
# normalize sce (divides by size factors and log-transforms)
normalized_sce <- normalize(sce)
# view new assay for normalized logcounts
assays(normalized_sce)
Chapter 3 - Visualization and Dimensionality Reduction
Mouse Epithelium Dataset:
Visualization:
Dimensionality Reduction:
Key plotting call: the dimension-reduction scatterplots are finished with `geom_point()`. Example code includes:
# ---- Visualization and dimensionality reduction on the epithelium data ----
# find dimensions
mydims <- dim(sce)
# extract cell and gene names
cellNames <- colnames(sce)
geneNames <- rownames(sce)
# cell data
cData <- colData(sce)
# print column names
colnames(cData)
# table batch & clusters
cData <- cData[, c('Batch', 'publishedClusters')]
# tabulate cData
table(cData)
# load scater
library(scater)
# plot pc1 and pc2 counts
plotPCA(
object = sce,
exprs_values = "counts",
shape_by = "Batch",
colour_by = "publishedClusters"
)
# explore initial assays
assays(sce)
# create log counts (log1p avoids log(0) for the zero-heavy counts)
logcounts <- log1p(assays(sce)$counts)
# add log counts
assay(sce, 'logcounts') <- logcounts
assays(sce)
# pca log counts
plotPCA(object = sce, exprs_values = "logcounts",
shape_by = "Batch", colour_by = "publishedClusters")
# default tSNE
plotTSNE(
sce,
exprs_values = "counts",
shape_by = "publishedClusters",
colour_by = "Batch",
perplexity = 5
)
# gene variance (on the log scale, per gene)
vars <- assay(sce) %>% log1p() %>% rowVars()
# rename vars
names(vars) <- rownames(sce)
# sort vars
vars_2 <- sort(vars, decreasing = TRUE)
head(vars_2)
# subset sce
# NOTE(review): this indexes the UNsorted `vars`, so it takes the first 50
# genes in matrix order rather than the 50 most variable (vars_2) — confirm
# whether that was intended in the original exercise.
sce_sub <- sce[names(vars[1:50]), ]
sce_sub
# log counts
logcounts <- log1p(assays(sce_sub)$counts)
# transpose so that rows = cells, as prcomp expects observations in rows
tlogcounts <- t(logcounts)
# perform pca
pca <- prcomp(tlogcounts)
# store pca matrix in sce
reducedDims(sce_sub) <- SimpleList(PCA = pca$x)
head(reducedDim(sce_sub, "PCA")[, 1:2])
# Extract PC1 and PC2 and create a data frame
pca <- reducedDim(sce_sub, "PCA")[, 1:2]
col_shape <- data.frame(publishedClusters = colData(sce)$publishedClusters, Batch = factor(colData(sce)$Batch))
df <- cbind(pca, col_shape)
# plot PC1, PC2
ggplot(df, aes(x = PC1, y = PC2,
colour = publishedClusters,
shape = Batch)) +
geom_point()
Chapter 4 - Cell Clustering and Differential Expression Analysis
Clustering methods for scRNA-Seq:
Key `CreateSeuratObject()` arguments: `raw.data = assay(sce)`, `normalization.method = "LogNormalize"`, `scale.factor = 10000`, `meta.data = as.data.frame(colData(sce))`. Differential expression analysis:
Key MAST snippets: merge the hurdle-test p-values with the log fold changes via `merge(fit[contrast=='celltype9' & component=='H', .(primerid, Pr(>Chisq))], fit[contrast=='celltype9' & component=='logFC', .(primerid, coef)], by='primerid')`, then FDR-adjust with `fit[, padjusted := p.adjust(Pr(>Chisq), 'fdr')]`. Visualization of DE genes:
Key volcano-plot labeling: `ggtitle("Volcano") + xlab("log2 FC") + ylab("-log10 adjusted p-value")`. Example code includes:
# ---- Clustering (Seurat) and differential expression (MAST) ----
# load Seurat
library(Seurat)
# create seurat object
seuset <- CreateSeuratObject(
raw.data = assay(sce),
normalization.method = "LogNormalize",
scale.factor = 10000,
meta.data = as.data.frame(colData(sce))
)
# scale seuset object
scaled_seuset <- ScaleData(object = seuset)
# perform pca
seuset <- RunPCA(
object = seuset,
pc.genes = rownames(seuset@raw.data),
do.print = FALSE
)
# plot pca
PCAPlot(object = seuset,
pt.shape = 'Batch',
group.by = 'publishedClusters')
# load MAST
library(MAST)
# SingleCellAssay object (created elsewhere; printed here for inspection)
sca
# fit zero-inflated (hurdle) regression: cell type plus detection rate covariate
zlm <- zlm(~ celltype + cngeneson, sca)
# summary with likelihood ratio test for the celltype9 contrast
summary <- summary(zlm, doLRT = "celltype9")
# get summary table
fit <- summary$datatable
# pvalue df: component 'H' is the combined hurdle-model test
pvalue <- fit[contrast == 'celltype9' & component == 'H', .(primerid, `Pr(>Chisq)`)]
# logFC df
logFC <- fit[contrast == 'celltype9' & component == 'logFC', .(primerid, coef)]
# pvalues and logFC
fit <- merge(pvalue, logFC, by = 'primerid')
# adjusted pvalues (Benjamini-Hochberg FDR)
fit[, padjusted:=p.adjust(`Pr(>Chisq)`, 'fdr')]
# result table
# NOTE(review): fit$padj relies on $ partial matching of the 'padjusted'
# column created above; spelling it out would be safer.
res <- data.frame(gene = fit$primerid,
pvalue = fit[,'Pr(>Chisq)'],
padjusted = fit$padj,
logFC = fit$coef)
# most DE: take the 20 genes with the smallest adjusted p-values
res <- res[order(res$padjusted), ]
mostDE <- res$gene[1:20]
res$mostDE <- res$gene %in% mostDE
# volcano plot
ggplot(res, aes(x=logFC, y=-log10(padjusted), color=mostDE)) +
geom_point() +
ggtitle("Volcano plot") +
xlab("log2 fold change") +
ylab("-log10 adjusted p-value")
# load NMF
library(NMF)
# normalize log counts
norm <- assay(sce[mostDE, ], "logcounts")
mat <- as.matrix(norm)
# heatmap of the top DE genes, columns annotated by published cluster
aheatmap(mat, annCol = colData(sce)$publishedClusters)
Chapter 1 - Differential Expression Analysis
Differential expression analysis:
Differential expression data:
ExpressionSet class:
The limma package:
Example code includes:
# ---- Differential expression with limma: ExpressionSet and a two-group model ----
# x = expression matrix, p = phenotype (sample) data, f = feature (gene) data
# Create a boxplot of the first gene in the expression matrix
boxplot(x[1, ] ~ p[, "Disease"], main = f[1, "symbol"])
# Load package
library(Biobase)
# Create ExpressionSet object (bundles expression, sample, and feature data)
eset <- ExpressionSet(assayData = x,
phenoData = AnnotatedDataFrame(p),
featureData = AnnotatedDataFrame(f))
# View the number of features (rows) and samples (columns)
dim(eset)
# Subset to only include the 1000th gene (row) and the first 10 samples
eset_sub <- eset[1000, 1:10]
# Check the dimensions of the subset
dim(eset_sub)
# Create a boxplot of the first gene in eset_sub
boxplot(exprs(eset_sub)[1, ] ~ pData(eset_sub)[, "Disease"],
main = fData(eset_sub)[1, "symbol"])
# Create design matrix for leukemia study (intercept + Disease effect)
design <- model.matrix(~Disease, data = pData(eset))
# Count the number of samples modeled by each coefficient
colSums(design)
# Load package
library(limma)
# Fit the model (one linear model per gene)
fit <- lmFit(eset, design)
# Calculate the t-statistics (empirical-Bayes moderated)
fit <- eBayes(fit)
# Summarize results
results <- decideTests(fit[, "Diseasestable"])
summary(results)
Chapter 2 - Flexible Models for Common Study Designs
Flexible linear models:
Studies with more than two groups:
Factorial experimental design:
Example code includes:
# ---- limma: group-means parameterization, contrasts, and factorial designs ----
# Create design matrix with no intercept (one column per group mean)
design <- model.matrix(~0 + Disease, data = pData(eset))
# Count the number of samples modeled by each coefficient
colSums(design)
# Load package
library(limma)
# Create a contrasts matrix (progressive vs. stable disease)
cm <- makeContrasts(status = Diseaseprogres. - Diseasestable, levels = design)
# View the contrasts matrix
cm
# Load package
library(limma)
# Fit the model
fit <- lmFit(eset, design)
# Fit the contrasts
fit2 <- contrasts.fit(fit, contrasts = cm)
# Calculate the t-statistics for the contrasts
fit2 <- eBayes(fit2)
# Summarize results
results <- decideTests(fit2)
summary(results)
# Create design matrix with no intercept (three oxygen levels)
design <- model.matrix(~0 + oxygen, data = pData(eset))
# Count the number of samples modeled by each coefficient
colSums(design)
# Load package
library(limma)
# Create a contrasts matrix: all pairwise comparisons of oxygen levels
cm <- makeContrasts(ox05vox01 = oxygenox05 - oxygenox01,
ox21vox01 = oxygenox21 - oxygenox01,
ox21vox05 = oxygenox21 - oxygenox05,
levels = design)
# View the contrasts matrix
cm
# Load package
library(limma)
# Fit the model
fit <- lmFit(eset, design)
# Fit the contrasts
fit2 <- contrasts.fit(fit, contrasts = cm)
# Calculate the t-statistics for the contrasts
fit2 <- eBayes(fit2)
# Summarize results
results <- decideTests(fit2)
summary(results)
# Create single variable combining the two factors (type x water)
group <- with(pData(eset), paste(type, water, sep = "."))
group <- factor(group)
# Create design matrix with no intercept
design <- model.matrix(~0 + group)
colnames(design) <- levels(group)
# Count the number of samples modeled by each coefficient
colSums(design)
# Load package
library(limma)
# Create a contrasts matrix: main effects plus the interaction
# (interaction = difference of the water effects between the two types)
cm <- makeContrasts(type_normal = nm6.normal - dn34.normal,
type_drought = nm6.drought - dn34.drought,
water_nm6 = nm6.drought - nm6.normal,
water_dn34 = dn34.drought - dn34.normal,
interaction = (nm6.drought - nm6.normal) - (dn34.drought - dn34.normal),
levels = design)
# View the contrasts matrix
cm
# Load package
library(limma)
# Fit the model
fit <- lmFit(eset, design)
# Fit the contrasts
fit2 <- contrasts.fit(fit, contrasts = cm)
# Calculate the t-statistics for the contrasts
fit2 <- eBayes(fit2)
# Summarize results
results <- decideTests(fit2)
summary(results)
Chapter 3 - Pre-processing and post-processing
Normalizing and filtering:
Accounting for technical batch effects:
Visualizing results:
Enrichment testing:
Example code includes:
# ---- limma: normalization, filtering, batch correction, and enrichment ----
# Load package
library(limma)
# View the distribution of the raw data
plotDensities(eset, legend = FALSE)
# Log transform (compresses the right-skewed raw intensities)
exprs(eset) <- log(exprs(eset))
plotDensities(eset, legend = FALSE)
# Quantile normalize (forces all samples onto a common distribution)
exprs(eset) <- normalizeBetweenArrays(exprs(eset))
plotDensities(eset, legend = FALSE)
# Load package
library(limma)
# View the normalized gene expression levels
plotDensities(eset, legend = FALSE); abline(v = 5)
# Determine the genes with mean expression level greater than 5
keep <- rowMeans(exprs(eset)) > 5
sum(keep)
# Filter the genes
eset <- eset[keep, ]
plotDensities(eset, legend = FALSE)
# Load package
library(limma)
# Plot principal components labeled by treatment
plotMDS(eset, labels = pData(eset)[, "treatment"], gene.selection = "common")
# Plot principal components labeled by batch
plotMDS(eset, labels = pData(eset)[, "batch"], gene.selection = "common")
# Load package
library(limma)
# Remove the batch effect
# NOTE(review): for inference it is generally preferable to include batch in
# the design matrix; removeBatchEffect() is intended for visualization.
exprs(eset) <- removeBatchEffect(eset, batch = pData(eset)[, "batch"])
# Plot principal components labeled by treatment
plotMDS(eset, labels = pData(eset)[, "treatment"], gene.selection = "common")
# Plot principal components labeled by batch
plotMDS(eset, labels = pData(eset)[, "batch"], gene.selection = "common")
# Obtain the summary statistics for every gene
stats <- topTable(fit2, number = nrow(fit2), sort.by = "none")
# Plot a histogram of the p-values
hist(stats[, "P.Value"])
# Create a volcano plot. Highlight the top 5 genes
volcanoplot(fit2, highlight = 5, names = fit2$genes$symbol)
# Extract the entrez gene IDs
entrez <- fit2$genes[, "entrez"]
# Test for enriched KEGG Pathways (species "Hs" = human)
enrich_kegg <- kegga(fit2, geneid = entrez, species = "Hs")
# View the top 20 enriched KEGG pathways
topKEGG(enrich_kegg, number=20)
# Extract the entrez gene IDs
entrez <- fit2$genes[, "entrez"]
# Test for enriched GO categories
enrich_go <- goana(fit2, geneid = entrez, species = "Hs")
# View the top 20 enriched GO Biological Processes
topGO(enrich_go, ontology = "BP", number=20)
Chapter 4 - Case Study: Effect of Doxorubicin Treatment
Pre-process data:
Model the data:
Inspect the results:
Wrap up:
Example code includes:
# ---- Case study: doxorubicin treatment in wild-type vs. Top2b-null mice ----
# Log transform
exprs(eset) <- log(exprs(eset))
plotDensities(eset, group = pData(eset)[, "genotype"], legend = "topright")
# Quantile normalize
exprs(eset) <- normalizeBetweenArrays(exprs(eset))
plotDensities(eset, group = pData(eset)[, "genotype"], legend = "topright")
# Determine the genes with mean expression level greater than 0
keep <- rowMeans(exprs(eset)) > 0
sum(keep)
# Filter the genes
eset <- eset[keep, ]
plotDensities(eset, group = pData(eset)[, "genotype"], legend = "topright")
# Find the row which contains Top2b expression data
top2b <- which(fData(eset)["symbol"] == "Top2b")
# Plot Top2b expression versus genotype (sanity check: knockout should be low)
boxplot(exprs(eset)[top2b, ] ~ pData(eset)[, "genotype"], main = fData(eset)[top2b, ])
# Plot principal components labeled by genotype
plotMDS(eset, labels = pData(eset)[, "genotype"], gene.selection = "common")
# Plot principal components labeled by treatment
plotMDS(eset, labels = pData(eset)[, "treatment"], gene.selection = "common")
# Create single variable combining genotype and treatment
group <- with(pData(eset), paste(genotype, treatment, sep = "."))
group <- factor(group)
# Create design matrix with no intercept
design <- model.matrix(~0 + group)
colnames(design) <- levels(group)
# Count the number of samples modeled by each coefficient
colSums(design)
# Create a contrasts matrix: drug effect per genotype plus their interaction
cm <- makeContrasts(dox_wt = wt.dox - wt.pbs,
dox_top2b = top2b.dox - top2b.pbs,
interaction = (top2b.dox - top2b.pbs) - (wt.dox - wt.pbs),
levels = design)
# View the contrasts matrix
cm
# Fit the model
fit <- lmFit(eset, design)
# Fit the contrasts
fit2 <- contrasts.fit(fit, contrasts = cm)
# Calculate the t-statistics for the contrasts
fit2 <- eBayes(fit2)
# Summarize results
results <- decideTests(fit2)
summary(results)
# Create a Venn diagram (overlap of significant genes across contrasts)
vennDiagram(results)
# Obtain the summary statistics for the contrast dox_wt
stats_dox_wt <- topTable(fit2, coef = "dox_wt", number = nrow(fit2), sort.by = "none")
# Obtain the summary statistics for the contrast dox_top2b
stats_dox_top2b <- topTable(fit2, coef = "dox_top2b", number = nrow(fit2), sort.by = "none")
# Obtain the summary statistics for the contrast interaction
stats_interaction <- topTable(fit2, coef = "interaction", number = nrow(fit2), sort.by = "none")
# Create histograms of the p-values for each contrast
hist(stats_dox_wt[, "P.Value"])
hist(stats_dox_top2b[, "P.Value"])
hist(stats_interaction[, "P.Value"])
# Extract the gene symbols
gene_symbols <- fit2$genes[, "symbol"]
# Create a volcano plot for the contrast dox_wt
volcanoplot(fit2, coef = "dox_wt", highlight = 5, names = gene_symbols)
# Create a volcano plot for the contrast dox_top2b
volcanoplot(fit2, coef = "dox_top2b", highlight = 5, names = gene_symbols)
# Create a volcano plot for the contrast interaction
volcanoplot(fit2, coef = "interaction", highlight = 5, names = gene_symbols)
# Extract the entrez gene IDs
entrez <- fit2$genes[, "entrez"]
# Test for enriched KEGG Pathways for contrast dox_wt (species "Mm" = mouse)
enrich_dox_wt <- kegga(fit2, coef = "dox_wt", geneid = entrez, species = "Mm")
# View the top 5 enriched KEGG pathways
topKEGG(enrich_dox_wt, number = 5)
# Test for enriched KEGG Pathways for contrast interaction
enrich_interaction <- kegga(fit2, coef = "interaction", geneid = entrez, species = "Mm")
# View the top 5 enriched KEGG pathways
topKEGG(enrich_interaction, number = 5)
Chapter 1 - rbokeh Introduction
Getting started with rbokeh:
Layers for rbokeh:
Layers for rbokeh (continued):
Example code includes:
# ---- rbokeh basics: point and line layers on the gapminder data ----
## load rbokeh, gapminder and dplyr libraries
library(rbokeh)
library(gapminder)
library(dplyr)
## explore gapminder dataset
str(gapminder)
## filter gapminder data by year 1982
dat_1982 <- gapminder %>% filter(year == 1982)
## plot life expectancy Vs GDP per Capita using data_1982
figure(legend_location = "bottom_right", title = "Life Expectancy Vs. GDP per Capita in 1982") %>%
ly_points(x = gdpPercap, y = lifeExp, data = dat_1982,
color = continent, hover = c(continent, country, pop)
)
## filter the dataset for the continent Africa and year 1967
data_africa <- gapminder %>%
filter(year==1967, continent=="Africa")
## view data_africa
data_africa
## plot life expectancy Vs GDP per Capita using data_africa
figure(legend_location = "bottom_right",
title = "Life Expectancy Vs. GDP per Capita in Africa - 1967"
) %>%
ly_points(x = gdpPercap, y = lifeExp, data = data_africa, hover = c(country, pop))
## add a new column with gdp in millions
gapminder_mill <- gapminder %>%
mutate(gdp_millions = gdpPercap * pop / 10^6)
## view the first 6 entries in gapminder after adding gdp_millions
head(gapminder_mill)
## extract the entries for "Rwanda"
data_rwanda <- gapminder_mill %>%
filter(country=="Rwanda")
## explore data_rwanda
data_rwanda
## plot gdp over time
figure(data = data_rwanda) %>%
ly_lines(x = year, y = gdp_millions, width = 2)
## explore the economics dataset
data(economics)
str(economics)
## pass vectors to x & y
figure() %>%
ly_lines(x = economics$date, y = economics$pce)
## pass columns names and dataframe (equivalent, tidier form)
figure() %>%
ly_lines(x = date, y = pce, data = economics)
## plot unemployment rate versus time and change the default `ylab`
figure(ylab = "unemployment %") %>%
ly_lines(x=date, y=100*unemploy/pop, data=economics)
dat_1992 <- gapminder %>%
filter(year==1992)
str(dat_1992)
## plot lifeExp Vs. gdpPercap using rbokeh
plot_1992<- figure(legend_location = "bottom_right") %>%
ly_points(x=gdpPercap, y=lifeExp, color=continent, data=dat_1992)
## show the plot
plot_1992
data_countries <- gapminder %>%
filter(country %in% c("United Kingdom", "Australia", "Canada", "United States", "New Zealand"))
str(data_countries)
## combine line and point layers on the same figure
figure(data = data_countries, legend="top_left") %>%
ly_lines(x = year, y = gdpPercap , color = country) %>%
ly_points(x=year, y=gdpPercap, color=country)
data_countries <- gapminder %>%
filter(country %in% c("China", "India"))
## create a line plot with lifeExp vs. year
fig_countries <- figure(legend="top_left") %>%
ly_lines(x=year, y=lifeExp, color=country, data=data_countries)
## View fig_countries
fig_countries
## modify fig_countries by adding a points layer with lifeExp vs. year
fig_countries %>%
ly_points(x=year, y=lifeExp, color=country, data=data_countries)
Chapter 2 - rbokeh Aesthetic Attributes and Figure Options
Plot and Managed Attributes (Part I):
Plot and Managed Attributes (Part II):
Hover Info and Figure Options:
Example code includes:
# Read the raw Human Development Index table; skip=1 drops the report's
# title row so the column headers are parsed correctly.
hdiRaw <- read.csv("./RInputFiles/Human Development Index (HDI).csv", skip=1)
str(hdiRaw)
# Reshape one-column-per-year to long form and tidy the key columns.
hdi_data <- hdiRaw %>%
gather(key="year", value="human_development_index", -Country, -`HDI.Rank..2017.`) %>%
# Year columns read in as "X1990", "X1991", ...; drop the leading "X"
# and coerce to integer. Country names carry stray whitespace, so trim.
mutate(country=str_trim(as.character(Country)), year=as.integer(str_sub(year, 2))) %>%
# BUG FIX: upper bound was 2105 (transposed digits); the intended window
# is 1990-2015, matching the hdi_cpi_2015 analysis later in these notes.
filter(year %in% 1990:2015) %>%
select(country, year, human_development_index)
str(hdi_data)
# ---- rbokeh aesthetic attributes, palettes, and hover/theme options ----
## extract "Namibia" and "Botswana" entries from hdi_data
hdi_countries <- hdi_data %>%
filter(country %in% c("Namibia", "Botswana"))
## plot human_development_index versus year
## (open points: white fill with a country-colored outline)
fig_col <- figure(data = hdi_countries, legend_location = "bottom_right") %>%
ly_lines(x = year, y = human_development_index, color = country) %>%
ly_points(x = year, y = human_development_index,
fill_color = "white", fill_alpha = 1,
line_color = country, line_alpha = 1,
size = 4
)
## view plot
fig_col
## use a custom palette with colors "green", "red"
fig_col %>%
set_palette(discrete_color = pal_color(c("green", "red")))
## define custom palette (hex colors)
custom_pal <- pal_color(c("#c51b8a", "#31a354"))
## use custom_pal to modify fig_col
fig_col %>%
set_palette(discrete_color=custom_pal)
## explore bechdel dataset using str
data(bechdel, package="fivethirtyeight")
str(bechdel)
## extract entries between 1980 - 2013
dat_80_13 <- bechdel %>%
filter(between(year, 1980, 2013))
## return on investment = inflation-adjusted gross over budget
dat_80_13 <- dat_80_13 %>%
mutate(roi_total = intgross_2013 / budget_2013)
## plot
figure() %>%
ly_points(x=log(budget_2013), y=log(roi_total), data=dat_80_13)
## plot log(roi_total) versus log(budget_2013)
## (transparent fills reduce overplotting in the dense scatter)
figure() %>%
ly_points(x=log(budget_2013), y=log(roi_total), size=5, line_alpha=0, fill_alpha=0.3, data=dat_80_13)
## filter the data by country = Syrian Arab Republic
hdi_countries <- hdi_data %>%
filter(country %in% c("Syrian Arab Republic", "Morocco"))
## change the color and line width
figure(title = "Human Development Index over Time", legend = "bottom_right") %>%
ly_lines(x=year, y=human_development_index, color=country, width=3, data=hdi_countries)
# explore hdi_cpi_data dataset
# NOTE(review): the following calls are commented out in the original notes,
# presumably because hdi_cpi_2015 is built elsewhere — left as-is.
# str(hdi_cpi_2015)
## add multiple values as hover info (country, cpi_rank)
# figure(legend_location = "bottom_right") %>%
# ly_points(x=corruption_perception_index, y=human_development_index, color=continent, hover=c(country, cpi_rank), size=6, data=hdi_cpi_2015)
## modify the figure theme
# figure(title = "Corruption Perception Index Vs. Human Development Index 2015",
# legend_location = "bottom_right", xgrid = FALSE, ygrid = FALSE,
# xlab = "CPI", ylab = "HDI", theme=bk_ggplot_theme()) %>%
# ly_points(x = corruption_perception_index, y = human_development_index,
# data = hdi_cpi_2015, color = continent, size = 6, hover = c(country, cpi_rank)
# )
Chapter 3 - Data Manipulation for Visualization and More rbokeh Layers
Data Formats:
More rbokeh Layers:
Interaction Tools:
Example code includes:
ctry <- c('Afghanistan', 'Albania', 'Algeria', 'Angola', 'Argentina', 'Australia', 'Austria', 'Bahrain', 'Bangladesh', 'Belgium', 'Benin', 'Bosnia and Herzegovina', 'Botswana', 'Brazil', 'Bulgaria', 'Burkina Faso', 'Burundi', 'Cambodia', 'Cameroon', 'Canada', 'Central African Republic', 'Chad', 'Chile', 'China', 'Colombia', 'Comoros', 'Costa Rica', 'Croatia', 'Cuba', 'Czech Republic', 'Denmark', 'Djibouti', 'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador', 'Eritrea', 'Ethiopia', 'Finland', 'France', 'Gabon', 'Gambia', 'Germany', 'Ghana', 'Greece', 'Guatemala', 'Guinea', 'Guinea-Bissau', 'Haiti', 'Honduras', 'Hungary', 'Iceland', 'India', 'Indonesia', 'Iraq', 'Ireland', 'Israel', 'Italy', 'Jamaica', 'Japan', 'Jordan', 'Kenya', 'Kuwait', 'Lebanon', 'Lesotho', 'Liberia', 'Libya', 'Madagascar', 'Malawi', 'Malaysia', 'Mali', 'Mauritania', 'Mauritius', 'Mexico', 'Mongolia', 'Montenegro', 'Morocco', 'Mozambique', 'Myanmar', 'Namibia', 'Nepal', 'Netherlands', 'New Zealand', 'Nicaragua', 'Niger', 'Nigeria', 'Norway', 'Oman', 'Pakistan', 'Panama', 'Paraguay', 'Peru', 'Philippines', 'Poland', 'Portugal', 'Romania', 'Rwanda', 'Sao Tome and Principe', 'Saudi Arabia', 'Senegal', 'Serbia', 'Sierra Leone', 'Singapore', 'Slovenia', 'South Africa', 'Spain', 'Sri Lanka', 'Sudan', 'Sweden', 'Switzerland', 'Thailand', 'Togo', 'Trinidad and Tobago', 'Tunisia', 'Turkey', 'Uganda', 'United Kingdom', 'United States', 'Uruguay', 'Zambia', 'Zimbabwe', 'Afghanistan', 'Albania', 'Algeria', 'Angola', 'Argentina', 'Australia', 'Austria', 'Bahrain', 'Bangladesh', 'Belgium', 'Benin', 'Bosnia and Herzegovina', 'Botswana', 'Brazil', 'Bulgaria', 'Burkina Faso', 'Burundi', 'Cambodia', 'Cameroon', 'Canada', 'Central African Republic', 'Chad', 'Chile', 'China', 'Colombia', 'Comoros', 'Costa Rica', 'Croatia', 'Cuba', 'Czech Republic', 'Denmark', 'Djibouti', 'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador', 'Eritrea', 'Ethiopia', 'Finland', 'France', 'Gabon', 'Gambia', 'Germany', 'Ghana', 
'Greece', 'Guatemala', 'Guinea', 'Guinea-Bissau', 'Haiti', 'Honduras', 'Hungary', 'Iceland', 'India', 'Indonesia', 'Iraq', 'Ireland', 'Israel', 'Italy', 'Jamaica', 'Japan', 'Jordan', 'Kenya', 'Kuwait', 'Lebanon', 'Lesotho', 'Liberia', 'Libya', 'Madagascar', 'Malawi', 'Malaysia', 'Mali', 'Mauritania', 'Mauritius', 'Mexico', 'Mongolia', 'Montenegro', 'Morocco', 'Mozambique', 'Myanmar', 'Namibia', 'Nepal', 'Netherlands', 'New Zealand', 'Nicaragua', 'Niger', 'Nigeria', 'Norway', 'Oman', 'Pakistan', 'Panama', 'Paraguay', 'Peru', 'Philippines', 'Poland', 'Portugal', 'Romania', 'Rwanda', 'Sao Tome and Principe', 'Saudi Arabia', 'Senegal', 'Serbia', 'Sierra Leone', 'Singapore', 'Slovenia', 'South Africa', 'Spain', 'Sri Lanka', 'Sudan', 'Sweden', 'Switzerland', 'Thailand', 'Togo', 'Trinidad and Tobago', 'Tunisia', 'Turkey', 'Uganda', 'United Kingdom', 'United States', 'Uruguay', 'Zambia', 'Zimbabwe')
ctryCode <- c('AFG', 'ALB', 'DZA', 'AGO', 'ARG', 'AUS', 'AUT', 'BHR', 'BGD', 'BEL', 'BEN', 'BIH', 'BWA', 'BRA', 'BGR', 'BFA', 'BDI', 'KHM', 'CMR', 'CAN', 'CAF', 'TCD', 'CHL', 'CHN', 'COL', 'COM', 'CRI', 'HRV', 'CUB', 'CZE', 'DNK', 'DJI', 'DOM', 'ECU', 'EGY', 'SLV', 'ERI', 'ETH', 'FIN', 'FRA', 'GAB', 'GMB', 'DEU', 'GHA', 'GRC', 'GTM', 'GIN', 'GNB', 'HTI', 'HND', 'HUN', 'ISL', 'IND', 'IDN', 'IRQ', 'IRL', 'ISR', 'ITA', 'JAM', 'JPN', 'JOR', 'KEN', 'KWT', 'LBN', 'LSO', 'LBR', 'LBY', 'MDG', 'MWI', 'MYS', 'MLI', 'MRT', 'MUS', 'MEX', 'MNG', 'MON', 'MAR', 'MOZ', 'MMR', 'NAM', 'NPL', 'NLD', 'NZL', 'NIC', 'NER', 'NGA', 'NOR', 'OMN', 'PAK', 'PAN', 'PRY', 'PER', 'PHL', 'POL', 'PRT', 'ROM', 'RWA', 'STP', 'SAU', 'SEN', 'SCG', 'SLE', 'SGP', 'SVN', 'ZAF', 'ESP', 'LKA', 'SDN', 'SWE', 'CHE', 'THA', 'TGO', 'TTO', 'TUN', 'TUR', 'UGA', 'GBR', 'USA', 'URY', 'ZMB', 'ZWE', 'AFG', 'ALB', 'DZA', 'AGO', 'ARG', 'AUS', 'AUT', 'BHR', 'BGD', 'BEL', 'BEN', 'BIH', 'BWA', 'BRA', 'BGR', 'BFA', 'BDI', 'KHM', 'CMR', 'CAN', 'CAF', 'TCD', 'CHL', 'CHN', 'COL', 'COM', 'CRI', 'HRV', 'CUB', 'CZE', 'DNK', 'DJI', 'DOM', 'ECU', 'EGY', 'SLV', 'ERI', 'ETH', 'FIN', 'FRA', 'GAB', 'GMB', 'DEU', 'GHA', 'GRC', 'GTM', 'GIN', 'GNB', 'HTI', 'HND', 'HUN', 'ISL', 'IND', 'IDN', 'IRQ', 'IRL', 'ISR', 'ITA', 'JAM', 'JPN', 'JOR', 'KEN', 'KWT', 'LBN', 'LSO', 'LBR', 'LBY', 'MDG', 'MWI', 'MYS', 'MLI', 'MRT', 'MUS', 'MEX', 'MNG', 'MON', 'MAR', 'MOZ', 'MMR', 'NAM', 'NPL', 'NLD', 'NZL', 'NIC', 'NER', 'NGA', 'NOR', 'OMN', 'PAK', 'PAN', 'PRY', 'PER', 'PHL', 'POL', 'PRT', 'ROM', 'RWA', 'STP', 'SAU', 'SEN', 'SCG', 'SLE', 'SGP', 'SVN', 'ZAF', 'ESP', 'LKA', 'SDN', 'SWE', 'CHE', 'THA', 'TGO', 'TTO', 'TUN', 'TUR', 'UGA', 'GBR', 'USA', 'URY', 'ZMB', 'ZWE')
regn <- c('AP', 'ECA', 'MENA', 'SSA', 'AME', 'AP', 'WE/EU', 'MENA', 'AP', 'WE/EU', 'SSA', 'ECA', 'SSA', 'AME', 'WE/EU', 'SSA', 'SSA', 'AP', 'SSA', 'AME', 'SSA', 'SSA', 'AME', 'AP', 'AME', 'SSA', 'AME', 'WE/EU', 'AME', 'WE/EU', 'WE/EU', 'SSA', 'AME', 'AME', 'MENA', 'AME', 'SSA', 'SSA', 'WE/EU', 'WE/EU', 'SSA', 'SSA', 'WE/EU', 'SSA', 'WE/EU', 'AME', 'SSA', 'SSA', 'AME', 'AME', 'WE/EU', 'WE/EU', 'AP', 'AP', 'MENA', 'WE/EU', 'MENA', 'WE/EU', 'AME', 'AP', 'MENA', 'SSA', 'MENA', 'MENA', 'SSA', 'SSA', 'MENA', 'SSA', 'SSA', 'AP', 'SSA', 'SSA', 'SSA', 'AME', 'AP', 'ECA', 'MENA', 'SSA', 'AP', 'SSA', 'AP', 'WE/EU', 'AP', 'AME', 'SSA', 'SSA', 'WE/EU', 'MENA', 'AP', 'AME', 'AME', 'AME', 'AP', 'WE/EU', 'WE/EU', 'WE/EU', 'SSA', 'SSA', 'MENA', 'SSA', 'ECA', 'SSA', 'AP', 'WE/EU', 'SSA', 'WE/EU', 'AP', 'MENA', 'WE/EU', 'WE/EU', 'AP', 'SSA', 'AME', 'MENA', 'ECA', 'SSA', 'WE/EU', 'AME', 'AME', 'SSA', 'SSA', 'AP', 'ECA', 'MENA', 'SSA', 'AME', 'AP', 'WE/EU', 'MENA', 'AP', 'WE/EU', 'SSA', 'ECA', 'SSA', 'AME', 'WE/EU', 'SSA', 'SSA', 'AP', 'SSA', 'AME', 'SSA', 'SSA', 'AME', 'AP', 'AME', 'SSA', 'AME', 'WE/EU', 'AME', 'WE/EU', 'WE/EU', 'SSA', 'AME', 'AME', 'MENA', 'AME', 'SSA', 'SSA', 'WE/EU', 'WE/EU', 'SSA', 'SSA', 'WE/EU', 'SSA', 'WE/EU', 'AME', 'SSA', 'SSA', 'AME', 'AME', 'WE/EU', 'WE/EU', 'AP', 'AP', 'MENA', 'WE/EU', 'MENA', 'WE/EU', 'AME', 'AP', 'MENA', 'SSA', 'MENA', 'MENA', 'SSA', 'SSA', 'MENA', 'SSA', 'SSA', 'AP', 'SSA', 'SSA', 'SSA', 'AME', 'AP', 'ECA', 'MENA', 'SSA', 'AP', 'SSA', 'AP', 'WE/EU', 'AP', 'AME', 'SSA', 'SSA', 'WE/EU', 'MENA', 'AP', 'AME', 'AME', 'AME', 'AP', 'WE/EU', 'WE/EU', 'WE/EU', 'SSA', 'SSA', 'MENA', 'SSA', 'ECA', 'SSA', 'AP', 'WE/EU', 'SSA', 'WE/EU', 'AP', 'MENA', 'WE/EU', 'WE/EU', 'AP', 'SSA', 'AME', 'MENA', 'ECA', 'SSA', 'WE/EU', 'AME', 'AME', 'SSA', 'SSA')
cnt <- c('Asia', 'Europe', 'Africa', 'Africa', 'Americas', 'Oceania', 'Europe', 'Asia', 'Asia', 'Europe', 'Africa', 'Europe', 'Africa', 'Americas', 'Europe', 'Africa', 'Africa', 'Asia', 'Africa', 'Americas', 'Africa', 'Africa', 'Americas', 'Asia', 'Americas', 'Africa', 'Americas', 'Europe', 'Americas', 'Europe', 'Europe', 'Africa', 'Americas', 'Americas', 'Africa', 'Americas', 'Africa', 'Africa', 'Europe', 'Europe', 'Africa', 'Africa', 'Europe', 'Africa', 'Europe', 'Americas', 'Africa', 'Africa', 'Americas', 'Americas', 'Europe', 'Europe', 'Asia', 'Asia', 'Asia', 'Europe', 'Asia', 'Europe', 'Americas', 'Asia', 'Asia', 'Africa', 'Asia', 'Asia', 'Africa', 'Africa', 'Africa', 'Africa', 'Africa', 'Asia', 'Africa', 'Africa', 'Africa', 'Americas', 'Asia', 'Europe', 'Africa', 'Africa', 'Asia', 'Africa', 'Asia', 'Europe', 'Oceania', 'Americas', 'Africa', 'Africa', 'Europe', 'Asia', 'Asia', 'Americas', 'Americas', 'Americas', 'Asia', 'Europe', 'Europe', 'Europe', 'Africa', 'Africa', 'Asia', 'Africa', 'Europe', 'Africa', 'Asia', 'Europe', 'Africa', 'Europe', 'Asia', 'Africa', 'Europe', 'Europe', 'Asia', 'Africa', 'Americas', 'Africa', 'Europe', 'Africa', 'Europe', 'Americas', 'Americas', 'Africa', 'Africa', 'Asia', 'Europe', 'Africa', 'Africa', 'Americas', 'Oceania', 'Europe', 'Asia', 'Asia', 'Europe', 'Africa', 'Europe', 'Africa', 'Americas', 'Europe', 'Africa', 'Africa', 'Asia', 'Africa', 'Americas', 'Africa', 'Africa', 'Americas', 'Asia', 'Americas', 'Africa', 'Americas', 'Europe', 'Americas', 'Europe', 'Europe', 'Africa', 'Americas', 'Americas', 'Africa', 'Americas', 'Africa', 'Africa', 'Europe', 'Europe', 'Africa', 'Africa', 'Europe', 'Africa', 'Europe', 'Americas', 'Africa', 'Africa', 'Americas', 'Americas', 'Europe', 'Europe', 'Asia', 'Asia', 'Asia', 'Europe', 'Asia', 'Europe', 'Americas', 'Asia', 'Asia', 'Africa', 'Asia', 'Asia', 'Africa', 'Africa', 'Africa', 'Africa', 'Africa', 'Asia', 'Africa', 'Africa', 'Africa', 'Americas', 'Asia', 'Europe', 'Africa', 'Africa', 
'Asia', 'Africa', 'Asia', 'Europe', 'Oceania', 'Americas', 'Africa', 'Africa', 'Europe', 'Asia', 'Asia', 'Americas', 'Americas', 'Americas', 'Asia', 'Europe', 'Europe', 'Europe', 'Africa', 'Africa', 'Asia', 'Africa', 'Europe', 'Africa', 'Asia', 'Europe', 'Africa', 'Europe', 'Asia', 'Africa', 'Europe', 'Europe', 'Asia', 'Africa', 'Americas', 'Africa', 'Europe', 'Africa', 'Europe', 'Americas', 'Americas', 'Africa', 'Africa')
# Index label column for the long-format data: the first 121 rows carry the
# CPI label, the following 121 rows the HDI label.
idx <- c(
  rep("corruption_perception_index", times = 121),
  rep("human_development_index", times = 121)
)
cpiRk <- c(166, 88, 88, 163, 106, 13, 16, 50, 139, 15, 83, 76, 29, 76, 69, 76, 150, 150, 130, 10, 145, 147, 23, 83, 83, 136, 40, 50, 56, 38, 1, 98, 102, 106, 88, 72, 154, 102, 3, 23, 98, 123, 11, 56, 58, 123, 139, 158, 158, 111, 50, 13, 76, 88, 161, 18, 32, 61, 69, 18, 45, 139, 55, 123, 61, 83, 161, 123, 111, 54, 95, 111, 45, 111, 72, 61, 88, 111, 147, 45, 130, 9, 1, 130, 98, 136, 5, 60, 117, 72, 130, 88, 95, 29, 28, 58, 43, 66, 48, 61, 71, 119, 7, 34, 61, 37, 83, 165, 4, 6, 76, 106, 72, 76, 66, 139, 11, 16, 21, 76, 150, 166, 88, 88, 163, 106, 13, 16, 50, 139, 15, 83, 76, 29, 76, 69, 76, 150, 150, 130, 10, 145, 147, 23, 83, 83, 136, 40, 50, 56, 38, 1, 98, 102, 106, 88, 72, 154, 102, 3, 23, 98, 123, 11, 56, 58, 123, 139, 158, 158, 111, 50, 13, 76, 88, 161, 18, 32, 61, 69, 18, 45, 139, 55, 123, 61, 83, 161, 123, 111, 54, 95, 111, 45, 111, 72, 61, 88, 111, 147, 45, 130, 9, 1, 130, 98, 136, 5, 60, 117, 72, 130, 88, 95, 29, 28, 58, 43, 66, 48, 61, 71, 119, 7, 34, 61, 37, 83, 165, 4, 6, 76, 106, 72, 76, 66, 139, 11, 16, 21, 76, 150)
vl <- c(0.479, 0.764, 0.745, 0.533, 0.827, 0.939, 0.893, 0.824, 0.579, 0.896, 0.485, 0.75, 0.698, 0.754, 0.794, 0.402, 0.404, 0.563, 0.518, 0.92, 0.352, 0.396, 0.847, 0.738, 0.727, 0.498, 0.776, 0.827, 0.775, 0.878, 0.925, 0.473, 0.722, 0.739, 0.691, 0.68, 0.42, 0.448, 0.895, 0.897, 0.697, 0.452, 0.926, 0.579, 0.866, 0.64, 0.414, 0.424, 0.493, 0.625, 0.836, 0.921, 0.624, 0.689, 0.649, 0.923, 0.899, 0.887, 0.73, 0.903, 0.742, 0.555, 0.8, 0.763, 0.497, 0.427, 0.716, 0.512, 0.476, 0.789, 0.442, 0.513, 0.781, 0.762, 0.735, 0.807, 0.647, 0.418, 0.556, 0.64, 0.558, 0.924, 0.915, 0.645, 0.353, 0.527, 0.949, 0.796, 0.55, 0.788, 0.693, 0.74, 0.682, 0.855, 0.843, 0.802, 0.498, 0.574, 0.847, 0.494, 0.776, 0.42, 0.925, 0.89, 0.666, 0.884, 0.766, 0.49, 0.913, 0.939, 0.74, 0.487, 0.78, 0.725, 0.767, 0.493, 0.91, 0.92, 0.795, 0.579, 0.516, 11, 36, 36, 15, 32, 79, 76, 51, 25, 77, 37, 38, 63, 38, 41, 38, 21, 21, 27, 83, 24, 22, 70, 37, 37, 26, 55, 51, 47, 56, 91, 34, 33, 32, 36, 39, 18, 33, 90, 70, 34, 28, 81, 47, 46, 28, 25, 17, 17, 31, 51, 79, 38, 36, 16, 75, 61, 44, 41, 75, 53, 25, 49, 28, 44, 37, 16, 28, 31, 50, 35, 31, 53, 31, 39, 44, 36, 31, 22, 53, 27, 84, 91, 27, 34, 26, 88, 45, 30, 39, 27, 36, 35, 63, 64, 46, 54, 42, 52, 44, 40, 29, 85, 60, 44, 58, 37, 12, 89, 86, 38, 32, 39, 38, 42, 25, 81, 76, 74, 38, 21)
# Assemble the long-format HDI/CPI data set: one row per country x index
# combination, with the measurement in `value` and its label in `index`.
# `year` is a constant 2015L; stringsAsFactors = FALSE keeps the character
# columns as characters (needed for the reshaping below).
hdi_cpi_data_long <- data.frame(country=ctry, year=2015L, country_code=ctryCode, cpi_rank=cpiRk,
region=regn, continent=cnt, index=idx, value=vl,
stringsAsFactors = FALSE
)
## Explore the structure of hdi_cpi_data_long (242 rows x 8 columns expected,
## given the two stacked 121-country blocks built above)
str(hdi_cpi_data_long)
## Which distinct index labels are present? These become the column names
## when the data is reshaped to wide format below.
unique(hdi_cpi_data_long$index)
## Convert from long to wide: one column per value of `index`.
## pivot_wider() supersedes the retired spread() (tidyr >= 1.0.0, which this
## document attaches -- see the tidyverse startup message). Note that
## pivot_wider() returns a tibble and preserves input row order, so the
## printed head() may differ cosmetically from the spread() output.
hdi_cpi_data_wide <- hdi_cpi_data_long %>%
  pivot_wider(names_from = index, values_from = value)
## display the first 5 rows from hdi_cpi_data_wide
head(hdi_cpi_data_wide, 5)
## Scatter plot of corruption_perception_index versus human_development_index,
## colored by continent, with a hover tooltip per point.
## NOTE(review): figure()/ly_points() come from the rbokeh package, which is
## not attached in the header shown here -- presumably library(rbokeh) is
## loaded earlier in the document; confirm.
figure(legend_location = "top_left") %>%
ly_points(x=human_development_index, y=corruption_perception_index, color=continent, alpha=0.7,
hover=c(country, cpi_rank,corruption_perception_index, human_development_index),
data=hdi_cpi_data_wide
)
## Convert from wide back to long form.
## NOTE(review): gather() is superseded; the modern replacement is
## pivot_longer(). It is left as-is here because pivot_longer() interleaves
## the output rows per input row, while gather() stacks column-by-column, so
## switching would change the row order and the all.equal() round-trip check
## below.
hdi_cpi_remake_long <- hdi_cpi_data_wide %>%
gather(key="index", value="value", corruption_perception_index, human_development_index)
## display the first 5 rows of hdi_cpi_remake_long
head(hdi_cpi_remake_long, 5)
## Round-trip check: does long -> wide -> long reproduce the original data?
all.equal(hdi_cpi_data_long, hdi_cpi_remake_long)
## explore the unique values in the movie_budget column
# unique(dat_90_13_long$movie_budget)
## spread the values in the `movie_budget` in two columns
# dat_90_13_wide <- dat_90_13_long %>%
# spread(key=movie_budget, value=value)
## View column names of dat_90_13_wide
# names(dat_90_13_wide)
## create a scatter plot with log(budget_2013) Vs log(intgross_2013)
# p_scatter <- figure() %>%
# ly_points(y=log(intgross_2013), x=log(budget_2013), size=4, alpha=0.5, data=dat_90_13_wide)
## View plot
# p_scatter
## fit a linear reg model
# lin_reg <- lm(log(intgross_2013) ~ log(budget_2013), data = dat_90_13)
## add the linear regression line layer to p_scatter
# p_scatter %>%
# ly_abline(lin_reg)
## Subset the gapminder data to two single years for plotting
dat_2007 <- filter(gapminder, year == 2007)
dat_2002 <- filter(gapminder, year == 2002)
## Scatter plot of GDP per capita vs life expectancy (2007) with a hover
## tooltip and the toolbar docked above the plot
figure(toolbar_location="above", legend_location="bottom_right") %>%
ly_points(x=gdpPercap, y=lifeExp, color=continent, size=6, alpha=0.7,
data=dat_2007, hover=c(country, lifeExp, gdpPercap)
)
## Same plot for 2002, restricting the toolbar to selected tools only.
## NOTE(review): the "resize" tool was dropped from newer Bokeh releases;
## this may warn or be ignored under a recent rbokeh/Bokeh -- confirm.
figure(legend_location = "bottom_right", tools=c("resize", "save")) %>%
ly_points(x = gdpPercap, y = lifeExp, data = dat_2002, color = continent)
## As above, but with the toolbar hidden entirely (toolbar_location=NULL)
figure(legend_location = "bottom_right", tools=c("resize", "save"), toolbar_location=NULL) %>%
ly_points(x = gdpPercap, y = lifeExp, data = dat_2002, color = continent)
Chapter 4 - Grid Plots and Maps
Intro to Grid Plots:
Facets with Grid Plots:
rbokeh maps:
Example code includes:
tb <- data.frame(iso2="US",
gender=rep(c("m", "f"), each=84),
year=factor(c(1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008)),
age=c(1524, 1524, 1524, 1524, 1524, 1524, 1524, 1524, 1524, 1524, 1524, 1524, 1524, 1524, 2534, 2534, 2534, 2534, 2534, 2534, 2534, 2534, 2534, 2534, 2534, 2534, 2534, 2534, 3544, 3544, 3544, 3544, 3544, 3544, 3544, 3544, 3544, 3544, 3544, 3544, 3544, 3544, 4554, 4554, 4554, 4554, 4554, 4554, 4554, 4554, 4554, 4554, 4554, 4554, 4554, 4554, 5564, 5564, 5564, 5564, 5564, 5564, 5564, 5564, 5564, 5564, 5564, 5564, 5564, 5564, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 1524, 1524, 1524, 1524, 1524, 1524, 1524, 1524, 1524, 1524, 1524, 1524, 1524, 1524, 2534, 2534, 2534, 2534, 2534, 2534, 2534, 2534, 2534, 2534, 2534, 2534, 2534, 2534, 3544, 3544, 3544, 3544, 3544, 3544, 3544, 3544, 3544, 3544, 3544, 3544, 3544, 3544, 4554, 4554, 4554, 4554, 4554, 4554, 4554, 4554, 4554, 4554, 4554, 4554, 4554, 4554, 5564, 5564, 5564, 5564, 5564, 5564, 5564, 5564, 5564, 5564, 5564, 5564, 5564, 5564, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65),
count=c(355, 333, 330, 321, 331, 365, 320, 343, 365, 362, 383, 388, 414, 375, 876, 815, 701, 663, 616, 602, 613, 562, 526, 547, 535, 568, 490, 513, 1417, 1219, 1127, 1009, 1011, 906, 824, 813, 754, 728, 666, 659, 572, 495, 1121, 1073, 979, 1007, 930, 904, 876, 795, 828, 829, 767, 759, 744, 725, 742, 678, 679, 628, 601, 577, 524, 490, 487, 504, 499, 531, 533, 526, 1099, 1007, 944, 914, 801, 738, 649, 592, 650, 582, 624, 596, 562, 561, 280, 289, 269, 269, 232, 246, 239, 233, 277, 265, 241, 257, 257, 220, 579, 487, 449, 425, 391, 376, 410, 423, 353, 339, 348, 384, 338, 329, 499, 478, 447, 424, 394, 349, 346, 362, 310, 302, 276, 263, 260, 269, 285, 279, 254, 267, 245, 253, 247, 255, 269, 252, 242, 212, 225, 224, 202, 217, 201, 179, 244, 152, 176, 167, 169, 166, 161, 146, 135, 172, 591, 541, 514, 492, 444, 396, 389, 370, 354, 344, 322, 303, 308, 300),
stringsAsFactors = FALSE
)
# Inspect the full TB data set, then extract and inspect the 25-34
# age-group subset (age is coded numerically as 2534)
str(tb)
tb_2534 <- filter(tb, age == 2534)
str(tb_2534)
## Bar plot of TB case counts per year, colored by gender (age group 25-34)
bar_2534 <- figure() %>%
  ly_bar(x = year, y = count, color = gender, data = tb_2534, hover = TRUE)
## View figure
bar_2534
## Same plot with each year's bars normalized to shares (position = "fill").
## Fix: pass the data frame via the named `data` argument instead of
## positionally, for consistency with the call above and to avoid relying on
## argument position.
bar_2534_percent <- figure(ylab = "share") %>%
  ly_bar(x = year, y = count, data = tb_2534, color = gender, hover = TRUE, position = "fill")
## View figure
bar_2534_percent
## Collect the two bar figures in a named list; the names become the
## grid-plot panel labels
fig_list <- list(bar_2534 = bar_2534, bar_2534_percent = bar_2534_percent)
## Arrange the figures in a grid
grid_plot(fig_list, width=1000, height=400)
## Same grid, but with both panels sharing identical axis limits
grid_plot(figs = fig_list, width = 1000, height = 400, same_axes=TRUE)
# Line plot of TB case counts over time, one colored line per age group.
#
# x: a data frame with columns `year`, `count`, and `age`
# Returns an rbokeh figure.
plot_line <- function(x) {
  # Name both aesthetic arguments explicitly; the original passed `year`
  # positionally after the named `y`, which silently matched the x-axis
  # argument and was easy to misread.
  figure() %>%
    ly_lines(x = year, y = count, data = x, color = age, alpha = 1, width = 2)
}
## Split the TB data by gender and draw one line plot per subset
tb_female <- filter(tb, gender == "f")
tb_male <- filter(tb, gender == "m")
fig_female <- plot_line(tb_female)
fig_male <- plot_line(tb_male)
## Arrange the two figures in a grid with shared axis limits
fig_list <- list(female = fig_female, male = fig_male)
grid_plot(fig_list, width = 1000, height = 600, same_axes = TRUE)
## Build the same gender grid programmatically: split() partitions the data
## frame by gender, and lapply() maps plot_line over the resulting list
tb_split_gender <- split(tb, tb$gender)
fig_list <- lapply(tb_split_gender, plot_line)
grid_plot(fig_list, width = 1000, height = 600, same_axes = TRUE)
## Bar plot of TB case counts over time, with dodged (side-by-side) bars per
## gender and hover tooltips enabled.
##
## x: a data frame with columns `year`, `count`, and `gender`
## Returns an rbokeh figure.
plot_bar <- function(x){
figure() %>%
ly_bar(y=count, x=year, data=x, color = gender, position = "dodge", hover=TRUE)
}
## Split the TB data by age group and build one bar plot per group
tb_split_age <- split(tb, tb$age)
## Fix: the original line read `fig_list <- fig_list <- lapply(...)`; the
## doubled assignment was redundant.
fig_list <- lapply(tb_split_age, plot_bar)
## Arrange the per-age figures in a 3-row grid with shared axes, rotating the
## x-axis labels 90 degrees so the year labels stay readable
grid_plot(fig_list, width = 600, height = 900, nrow = 3, same_axes = TRUE) %>%
  theme_axis("x", major_label_orientation = 90)
## initialize a map for NY center
# ny_map <- gmap(lat=40.73306, lng=-73.97351, zoom=11, map_style=gmap_style("blue_water"))
# ny_map
## filter ny_bikedata to get the entries for day "2017-04-25"
# ny_bikedata_20170425 <- ny_bikedata %>% filter(trip_date==as.Date("2017-04-25"))
## add a points layer to ny_map
# ny_map %>%
# ly_points(y=station_latitude, x=station_longitude,
# size=8, fill_color=start_count, line_alpha=0,
# data=ny_bikedata_20170425, hover=c(station_name, start_count, end_count)
# )
## create a names list with the two figures
# fig_list <- list(map_weekend=map_weekend_20170423, map_weekday=map_weekday_20170425)
## create a grid plot with the 2 maps
# grid_plot(fig_list, width=860, height=420)
Chapter 1 - Mini Case Study in A/B Testing
Introduction:
Baseline conversion rates:
month(visit_date), y=conversion_rate)) + geom_point() + geom_line()
Experimental design and power analysis:
Example code includes:
# Read in data
click_data <- readr::read_csv("./RInputFiles/click_data.csv")
## Parsed with column specification:
## cols(
## visit_date = col_date(format = ""),
## clicked_adopt_today = col_double()
## )
click_data
## # A tibble: 3,650 x 2
## visit_date clicked_adopt_today
## <date> <dbl>
## 1 2017-01-01 1
## 2 2017-01-02 1
## 3 2017-01-03 0
## 4 2017-01-04 1
## 5 2017-01-05 1
## 6 2017-01-06 0
## 7 2017-01-07 0
## 8 2017-01-08 0
## 9 2017-01-09 0
## 10 2017-01-10 0
## # ... with 3,640 more rows
# Find oldest and most recent date
min(click_data$visit_date)
## [1] "2017-01-01"
max(click_data$visit_date)
## [1] "2017-12-31"
# Calculate the mean conversion rate by day of the week
click_data %>%
group_by(weekdays(visit_date)) %>%
summarize(conversion_rate = mean(clicked_adopt_today))
## # A tibble: 7 x 2
## `weekdays(visit_date)` conversion_rate
## <chr> <dbl>
## 1 Friday 0.267
## 2 Monday 0.277
## 3 Saturday 0.256
## 4 Sunday 0.3
## 5 Thursday 0.271
## 6 Tuesday 0.271
## 7 Wednesday 0.298
# Calculate the mean conversion rate by week of the year
click_data %>%
group_by(lubridate::week(visit_date)) %>%
summarize(conversion_rate = mean(clicked_adopt_today))
## # A tibble: 53 x 2
## `lubridate::week(visit_date)` conversion_rate
## <dbl> <dbl>
## 1 1 0.229
## 2 2 0.243
## 3 3 0.171
## 4 4 0.129
## 5 5 0.157
## 6 6 0.186
## 7 7 0.257
## 8 8 0.171
## 9 9 0.186
## 10 10 0.2
## # ... with 43 more rows
# Weekly conversion rate: tag each visit with its week number, then average
# the click indicator within each week
click_data_sum <- click_data %>%
  mutate(weekOfYear = lubridate::week(visit_date)) %>%
  group_by(weekOfYear) %>%
  summarize(conversion_rate = mean(clicked_adopt_today))
# Plot the weekly series on a fixed 0-100% scale
ggplot(click_data_sum, aes(x = weekOfYear, y = conversion_rate)) +
  geom_point() +
  geom_line() +
  scale_y_continuous(limits = c(0, 1), labels = scales::percent)
# Sample size for the August experiment: detect a jump from a 54% baseline
# conversion rate (p1) to 64% (p2) at alpha = 0.05 with 80% power.
# NOTE(review): per the powerMediation docs, B is the proportion of the
# sample in the treated group -- B = 0.5 means an even 50/50 split; confirm.
total_sample_size <- powerMediation::SSizeLogisticBin(p1 = 0.54, p2 = 0.64,
B = 0.5, alpha = 0.05, power = 0.8
)
total_sample_size
## [1] 758
# Same design but for a smaller, 5-point lift (54% -> 59%): the required
# sample size roughly quadruples when the detectable effect is halved.
total_sample_size <- powerMediation::SSizeLogisticBin(p1 = 0.54, p2 = 0.59,
B = 0.5, alpha = 0.05, power = 0.8
)
total_sample_size
## [1] 3085
Chapter 2 - Mini Case Study in A/B Testing - Part II
Analyzing Results:
Designing follow-up experiments:
Pre-follow-up-experiment assumptions:
Follow-up experiment assumptions:
mutate(month_text = month(visit_date, label = TRUE)) %>% group_by(month_text, condition) %>% summarize(conversion_rate = mean(clicked_adopt_today)) %>% spread(condition, conversion_rate) %>% mutate(condition_diff = cat_hat - no_hat)
Example code includes:
experiment_data <- read_csv("./RInputFiles/experiment_data.csv")
## Parsed with column specification:
## cols(
## visit_date = col_date(format = ""),
## condition = col_character(),
## clicked_adopt_today = col_double()
## )
experiment_data
## # A tibble: 588 x 3
## visit_date condition clicked_adopt_today
## <date> <chr> <dbl>
## 1 2018-01-01 control 0
## 2 2018-01-01 control 1
## 3 2018-01-01 control 0
## 4 2018-01-01 control 0
## 5 2018-01-01 test 0
## 6 2018-01-01 test 0
## 7 2018-01-01 test 1
## 8 2018-01-01 test 0
## 9 2018-01-01 test 0
## 10 2018-01-01 test 1
## # ... with 578 more rows
followup_experiment_data <- read_csv("./RInputFiles/eight_month_checkin_data.csv")
## Parsed with column specification:
## cols(
## visit_date = col_date(format = ""),
## condition = col_character(),
## clicked_adopt_today = col_double()
## )
followup_experiment_data
## # A tibble: 4,860 x 3
## visit_date condition clicked_adopt_today
## <date> <chr> <dbl>
## 1 2018-01-01 cat_hat 1
## 2 2018-01-01 cat_hat 1
## 3 2018-01-01 cat_hat 0
## 4 2018-01-01 cat_hat 0
## 5 2018-01-01 cat_hat 0
## 6 2018-01-01 cat_hat 0
## 7 2018-01-01 cat_hat 0
## 8 2018-01-01 cat_hat 0
## 9 2018-01-01 cat_hat 1
## 10 2018-01-01 no_hat 0
## # ... with 4,850 more rows
# Group and summarize data
experiment_data_clean_sum <- experiment_data %>%
group_by(condition, visit_date) %>%
summarize(conversion_rate = mean(clicked_adopt_today))
# Make plot of conversion rates over time
ggplot(experiment_data_clean_sum, aes(x = visit_date, y = conversion_rate,
color = condition, group = condition
)
) +
geom_point() +
geom_line()
# View summary of results
experiment_data %>%
group_by(condition) %>%
summarize(conversion_rate = mean(clicked_adopt_today))
## # A tibble: 2 x 2
## condition conversion_rate
## <chr> <dbl>
## 1 control 0.167
## 2 test 0.384
# Run logistic regression
experiment_results <- glm(clicked_adopt_today ~ condition, family = "binomial",
data = experiment_data
) %>%
broom::tidy()
experiment_results
## # A tibble: 2 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -1.61 0.156 -10.3 8.28e-25
## 2 conditiontest 1.14 0.197 5.77 7.73e- 9
# Run logistic regression power analysis
total_sample_size <- powerMediation::SSizeLogisticBin(p1 = 0.39, p2 = 0.59, B = 0.5,
alpha = 0.05, power = 0.8
)
total_sample_size
## [1] 194
# View conversion rates by condition
followup_experiment_data %>%
group_by(condition) %>%
summarize(conversion_rate = mean(clicked_adopt_today))
## # A tibble: 2 x 2
## condition conversion_rate
## <chr> <dbl>
## 1 cat_hat 0.459
## 2 no_hat 0.271
# Run logistic regression
followup_experiment_results <- glm(clicked_adopt_today ~ condition, family = "binomial",
data = followup_experiment_data
) %>%
broom::tidy()
followup_experiment_results
## # A tibble: 2 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -0.163 0.0407 -4.01 6.02e- 5
## 2 conditionno_hat -0.825 0.0611 -13.5 1.66e-41
# Compute monthly summary
eight_month_checkin_data_sum <- followup_experiment_data %>%
mutate(month_text = lubridate::month(visit_date, label = TRUE)) %>%
group_by(month_text, condition) %>%
summarize(conversion_rate = mean(clicked_adopt_today))
# Plot month-over-month results
ggplot(eight_month_checkin_data_sum, aes(x = month_text, y = conversion_rate,
color = condition, group = condition
)
) +
geom_point() +
geom_line()
# Plot monthly summary
ggplot(eight_month_checkin_data_sum, aes(x = month_text, y = conversion_rate,
color = condition, group = condition
)
) +
geom_point() +
geom_line() +
scale_y_continuous(limits = c(0, 1), labels = scales::percent) +
labs(x = "Month", y = "Conversion Rate")
# Compute difference over time
# no_hat_data_diff <- no_hat_data_sum %>%
# spread(year, conversion_rate) %>%
# mutate(year_diff = `2018` - `2017`)
# no_hat_data_diff
# Compute summary statistics
# mean(no_hat_data_diff$year_diff, na.rm = TRUE)
# sd(no_hat_data_diff$year_diff, na.rm = TRUE)
# Run power analysis for logistic regression
total_sample_size <- powerMediation::SSizeLogisticBin(p1 = 0.49, p2 = 0.64, B = 0.5,
alpha = 0.05, power = 0.8
)
total_sample_size
## [1] 341
# View summary of data
# followup_experiment_data_sep %>%
# group_by(condition) %>%
# summarize(conversion_rate=mean(clicked_adopt_today))
# Run logistic regression
# followup_experiment_sep_results <- glm(clicked_adopt_today ~ condition,
# family = "binomial",
# data = followup_experiment_data_sep
# ) %>%
# broom::tidy()
# followup_experiment_sep_results
Chapter 3 - Experimental Design in A/B Testing
A/B Testing Research Questions:
Assumptions and types of A/B testing:
Confounding variables:
Side effects:
Example code includes:
# Compute summary by month
viz_website_2017 %>%
group_by(month(visit_date)) %>%
summarize(article_conversion_rate = mean(clicked_article))
# Compute 'like' click summary by month
viz_website_2017_like_sum <- viz_website_2017 %>%
mutate(month = month(visit_date, label = TRUE)) %>%
group_by(month) %>%
summarize(like_conversion_rate = mean(clicked_like))
# Plot 'like' click summary by month
ggplot(viz_website_2017_like_sum,
aes(x = month, y = like_conversion_rate, group = 1)
) +
geom_point() +
geom_line() +
scale_y_continuous(limits = c(0, 1), labels = percent)
# Plot comparison of 'like'ing and 'sharing'ing an article
ggplot(viz_website_2017_like_share_sum,
aes(x = month, y = conversion_rate, color = action, group = action)
) +
geom_point() +
geom_line() +
scale_y_continuous(limits = c(0, 1), labels = percent)
# Compute conversion rates for A/A experiment
viz_website_2018_01_sum <- viz_website_2018_01 %>%
group_by(condition) %>%
summarize(like_conversion_rate = mean(clicked_like))
viz_website_2018_01_sum
# Plot conversion rates for two conditions
ggplot(viz_website_2018_01_sum, aes(x = condition, y = like_conversion_rate)) +
geom_bar(stat = "identity") +
scale_y_continuous(limits = c(0, 1), labels = percent)
# Run logistic regression
aa_experiment_results <- glm(clicked_like ~ condition, family = "binomial", data = viz_website_2018_01) %>%
broom::tidy()
aa_experiment_results
# Compute 'like' conversion rate by week and condition
viz_website_2018_02 %>%
mutate(week = week(visit_date)) %>%
group_by(week, condition) %>%
summarize(like_conversion_rate = mean(clicked_like))
# Compute 'like' conversion rate by if article published and condition
viz_website_2018_02 %>%
group_by(article_published, condition) %>%
summarize(like_conversion_rate = mean(clicked_like))
# Plot 'like' conversion rates by date for experiment
ggplot(viz_website_2018_02_sum,
aes(x = visit_date, y = like_conversion_rate, color = condition,
linetype = article_published, group = interaction(condition, article_published)
)
) +
geom_point() +
geom_line() +
geom_vline(xintercept = as.numeric(as.Date("2018-02-15"))) +
scale_y_continuous(limits = c(0, 0.3), labels = percent)
# Compute 'like' conversion rate and mean pageload time by day
viz_website_2018_03_sum <- viz_website_2018_03 %>%
group_by(visit_date, condition) %>%
summarize(mean_pageload_time = mean(pageload_time), like_conversion_rate = mean(clicked_like))
# Plot effect of 'like' conversion rate by pageload time
ggplot(viz_website_2018_03_sum, aes(x = mean_pageload_time, y = like_conversion_rate, color = condition)) +
geom_point()
# Plot 'like' conversion rate by day
ggplot(viz_website_2018_03_sum, aes(x = visit_date, y = like_conversion_rate, color = condition,
linetype = pageload_delay_added,
group = interaction(condition, pageload_delay_added)
)
) +
geom_point() +
geom_line() +
geom_vline(xintercept = as.numeric(as.Date("2018-03-15"))) +
scale_y_continuous(limits = c(0, 0.3), labels = percent)
Chapter 4 - Statistical Analyses in A/B Testing
Power analyses:
Statistical tests:
Stopping rules and sequential analysis:
Multivariate testing:
A/B Testing Recap:
Example code includes:
# Run power analysis for logistic regression
total_sample_size <- powerMediation::SSizeLogisticBin(p1 = 0.17, p2 = 0.27,
B = 0.5, alpha = 0.05, power = 0.8
)
total_sample_size
# Run power analysis for t-test
sample_size <- pwr::pwr.t.test(d = 0.3, sig.level = 0.05, power = 0.8)
sample_size
# Run logistic regression
ab_experiment_results <- glm(clicked_like ~ condition, family = "binomial", data = viz_website_2018_04) %>%
broom::tidy()
ab_experiment_results
# Run t-test
ab_experiment_results <- t.test(time_spent_homepage_sec ~ condition, data = viz_website_2018_04)
ab_experiment_results
# Run sequential analysis
seq_analysis_3looks <- gsDesign::gsDesign(k = 3, test.type = 1,
alpha = 0.05, beta = 0.2, sfu = "Pocock"
)
seq_analysis_3looks
# Fill in max number of points and compute points per group and find stopping points
max_n <- 3000
max_n_per_group <- max_n / 2
stopping_points <- max_n_per_group * seq_analysis_3looks$timing
stopping_points
# Compute summary values for four conditions
viz_website_2018_05_sum <- viz_website_2018_05 %>%
group_by(word_one, word_two) %>%
summarize(mean_time_spent_homepage_sec = mean(time_spent_homepage_sec))
# Plot summary values for four conditions
ggplot(viz_website_2018_05_sum, aes(x = word_one, y = mean_time_spent_homepage_sec, fill = word_two)) +
geom_bar(stat = "identity", position = "dodge")
# Compute summary values for four conditions
viz_website_2018_05_sum <- viz_website_2018_05 %>%
group_by(word_one, word_two) %>%
summarize(like_conversion_rate = mean(clicked_like))
# Plot summary values for four conditions
ggplot(viz_website_2018_05_sum, aes(x = word_one, y = like_conversion_rate, fill = word_two)) +
geom_bar(stat = "identity", position = "dodge") +
scale_y_continuous(limits = c(0, 1), labels = percent)
# Organize variables and run logistic regression
viz_website_2018_05_like_results <- viz_website_2018_05 %>%
mutate(word_one = factor(word_one, levels = c("tips", "tools"))) %>%
mutate(word_two = factor(word_two, levels = c("better", "amazing"))) %>%
glm(clicked_like ~ word_one * word_two, family = "binomial", data = .) %>%
broom::tidy()
viz_website_2018_05_like_results
Chapter 1 - Introduction to Mixture Models
Introduction to Model-Based Clustering:
Gaussian Distribution:
Gaussian Mixture Models (GMM):
Example code includes:
gender <- readr::read_csv("./RInputFiles/gender.csv")
## Parsed with column specification:
## cols(
## Gender = col_character(),
## Height = col_double(),
## Weight = col_double(),
## BMI = col_double(),
## probability = col_double()
## )
glimpse(gender)
## Observations: 10,000
## Variables: 5
## $ Gender <chr> "Male", "Male", "Male", "Male", "Male", "Male", "Male",...
## $ Height <dbl> 73.84702, 68.78190, 74.11011, 71.73098, 69.88180, 67.25...
## $ Weight <dbl> 241.8936, 162.3105, 212.7409, 220.0425, 206.3498, 152.2...
## $ BMI <dbl> 0.04435662, 0.03430822, 0.03873433, 0.04276545, 0.04225...
## $ probability <dbl> 5.778312e-06, 6.059525e-01, 2.625952e-05, 3.628734e-04,...
# Have a look to gender (before clustering)
head(gender)
## # A tibble: 6 x 5
## Gender Height Weight BMI probability
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Male 73.8 242. 0.0444 0.00000578
## 2 Male 68.8 162. 0.0343 0.606
## 3 Male 74.1 213. 0.0387 0.0000263
## 4 Male 71.7 220. 0.0428 0.000363
## 5 Male 69.9 206. 0.0423 0.00461
## 6 Male 67.3 152. 0.0337 0.911
# Scatterplot with probabilities
gender %>%
ggplot(aes(x = Weight, y = BMI, col = probability))+
geom_point(alpha = 0.5)
# Fix the RNG seed so the simulated sample is reproducible
set.seed(1313)
# Draw 500 observations from a Gaussian with mean 5 and sd 4
simulation <- rnorm(500, mean = 5, sd = 4)
# Peek at the first few draws
head(simulation)
## [1] 2.618374 8.719739 10.469360 11.462134 6.165605 7.497809
# Estimation of the mean
mean_estimate <- mean(simulation)
mean_estimate
## [1] 5.324427
# Estimation of the standard deviation
standard_deviation_estimate <- sd(simulation)
standard_deviation_estimate
## [1] 3.769612
# Transform the results to a data frame
simulation <- data.frame(x = simulation)
# Plot the sample with the estimated curve
ggplot(simulation) +
geom_histogram(aes(x = x, y = ..density..)) +
stat_function(geom = "line", fun = dnorm,
args = list(mean = mean_estimate,
sd = standard_deviation_estimate))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Estimate single-Gaussian parameters for Weight directly from the column
mean_estimate <- mean(gender$Weight)
mean_estimate
## [1] 161.4404
sd_estimate <- sd(gender$Weight)
sd_estimate
## [1] 32.10844
# Histogram of Weight with the single-Gaussian density estimate overlaid
ggplot(gender) +
  geom_histogram(aes(x = Weight, y = ..density..), bins = 100) +
  stat_function(
    geom = "line", fun = dnorm,
    args = list(mean = mean_estimate, sd = sd_estimate)
  )
# Simulate a two-component Gaussian mixture: a biased "coin" assigns each
# observation to a component with probability 0.8 for component 1
coin <- sample(c(0, 1), size = 500, replace = TRUE, prob = c(0.2, 0.8))
# coin == 1 -> draw from N(5, 2); coin == 0 -> draw from N(0, 1)
mixture <- ifelse(coin == 1, rnorm(n = 500, mean = 5, sd = 2), rnorm(n = 500))
# Look at the first few draws
head(mixture)
## [1] 6.715330 5.157209 2.158443 5.133819 6.089409 1.007223
# Put the sample in a data frame for plotting
mixture <- data.frame(x = mixture)
# Histogram specifying that the y axis is a density
ggplot(mixture) +
  geom_histogram(aes(x = x, y = ..density..), bins = 50)
# Simulate a three-component Gaussian mixture with weights 0.3 / 0.4 / 0.3
n_obs <- 1000
# Component label (0, 1 or 2) for each observation
assignments <- sample(c(0, 1, 2), size = n_obs, replace = TRUE, prob = c(0.3, 0.4, 0.3))
# Draw each observation from its component:
#   1 -> N(5, 2), 2 -> N(10, 1), otherwise -> N(0, 1)
mixture <- data.frame(
  x = ifelse(assignments == 1, rnorm(n = n_obs, mean = 5, sd = 2),
             ifelse(assignments == 2,
                    rnorm(n = n_obs, mean = 10, sd = 1),
                    rnorm(n = n_obs)))
)
# Density-scaled histogram of the simulated mixture
ggplot(mixture) +
  geom_histogram(aes(x = x, y = ..density..), bins = 50)
Chapter 2 - Structure of Mixture Models and Parameter Estimation
Structure of Mixture Models:
Parameter Estimation:
select(x, prob_red, prob_blue) %>% head()

EM Algorithm:

data <- data %>%
  mutate(prob_from_red = proportions[1] * dnorm(x, mean = means[1]),
         prob_from_blue = proportions[2] * dnorm(x, mean = means[2]),
         prob_red = prob_from_red / (prob_from_red + prob_from_blue),
         prob_blue = prob_from_blue / (prob_from_red + prob_from_blue)) %>%
  select(x, prob_red, prob_blue)
return(data)

new_values <- maximization(expectation(data, means_init, props_init))
means_init <- new_values[[1]]
props_init <- new_values[[2]]
cat(c(i, means_init, proportions_init), "\n")

Example code includes:
# Load the 16x16 binary digit images: columns 1:256 are pixels,
# columns 257:266 are one-hot digit labels
digits <- readr::read_csv("./RInputFiles/digits.csv")
## Parsed with column specification:
## cols(
## .default = col_double()
## )
## See spec(...) for full column specifications.
dim(digits)
## [1] 1593 266
digitData <- digits[, 1:256]
digitKey <- digits[, 257:266]
# keep a subset of 4 and 8
# BUG FIX: rowSums() returns a numeric 0/1 vector, and numeric indexing with
# it repeatedly selects row 1 (for every 1) while silently dropping the 0s.
# Comparing to 1 yields the logical mask the subset actually needs.
digitUse <- rowSums(digitKey[, c(5, 9)] == 1) == 1
digData <- digitData[digitUse, ]
digKey <- digitKey[digitUse, ]
# Draw one digit: coerce a 1-row, 256-column slice to a numeric vector,
# reshape to 16x16 and flip the columns so the image is drawn upright
show_digit <- function(arr256, col = gray(4:1/4), ...) {
  arr256 <- as.numeric(arr256)
  image(matrix(arr256, nrow = 16)[, 16:1], col = col, ...)
}
# Dimension
# broom::glance(digits)
# Apply `glimpse` to the data
glimpse(digitData)
## Observations: 1,593
## Variables: 256
## $ V1 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ V2 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ V3 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,...
## $ V4 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,...
## $ V5 <dbl> 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0,...
## $ V6 <dbl> 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0,...
## $ V7 <dbl> 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0,...
## $ V8 <dbl> 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0,...
## $ V9 <dbl> 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0,...
## $ V10 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0,...
## $ V11 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0,...
## $ V12 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0,...
## $ V13 <dbl> 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,...
## $ V14 <dbl> 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1,...
## $ V15 <dbl> 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1,...
## $ V16 <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,...
## $ V17 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ V18 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,...
## $ V19 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0,...
## $ V20 <dbl> 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0,...
## $ V21 <dbl> 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0,...
## $ V22 <dbl> 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0,...
## $ V23 <dbl> 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0,...
## $ V24 <dbl> 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0,...
## $ V25 <dbl> 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0,...
## $ V26 <dbl> 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0,...
## $ V27 <dbl> 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0,...
## $ V28 <dbl> 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0,...
## $ V29 <dbl> 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ V30 <dbl> 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,...
## $ V31 <dbl> 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1,...
## $ V32 <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,...
## $ V33 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,...
## $ V34 <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0,...
## $ V35 <dbl> 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0,...
## $ V36 <dbl> 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0,...
## $ V37 <dbl> 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0,...
## $ V38 <dbl> 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0,...
## $ V39 <dbl> 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,...
## $ V40 <dbl> 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,...
## $ V41 <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,...
## $ V42 <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,...
## $ V43 <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,...
## $ V44 <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0,...
## $ V45 <dbl> 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1,...
## $ V46 <dbl> 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1,...
## $ V47 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1,...
## $ V48 <dbl> 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1,...
## $ V49 <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0,...
## $ V50 <dbl> 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0,...
## $ V51 <dbl> 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0,...
## $ V52 <dbl> 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0,...
## $ V53 <dbl> 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0,...
## $ V54 <dbl> 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0,...
## $ V55 <dbl> 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,...
## $ V56 <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,...
## $ V57 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ V58 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ V59 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ V60 <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,...
## $ V61 <dbl> 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1,...
## $ V62 <dbl> 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1,...
## $ V63 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1,...
## $ V64 <dbl> 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1,...
## $ V65 <dbl> 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0,...
## $ V66 <dbl> 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0,...
## $ V67 <dbl> 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,...
## $ V68 <dbl> 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0,...
## $ V69 <dbl> 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,...
## $ V70 <dbl> 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0,...
## $ V71 <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,...
## $ V72 <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ V73 <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ V74 <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ V75 <dbl> 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ V76 <dbl> 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ V77 <dbl> 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1,...
## $ V78 <dbl> 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1,...
## $ V79 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ V80 <dbl> 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0,...
## $ V81 <dbl> 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0,...
## $ V82 <dbl> 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0,...
## $ V83 <dbl> 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0,...
## $ V84 <dbl> 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,...
## $ V85 <dbl> 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,...
## $ V86 <dbl> 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0,...
## $ V87 <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ V88 <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ V89 <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ V90 <dbl> 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ V91 <dbl> 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ V92 <dbl> 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ V93 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,...
## $ V94 <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1,...
## $ V95 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ V96 <dbl> 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0,...
## $ V97 <dbl> 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0,...
## $ V98 <dbl> 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0,...
## $ V99 <dbl> 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0,...
## $ V100 <dbl> 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,...
## $ V101 <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,...
## $ V102 <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0,...
## $ V103 <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ V104 <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ V105 <dbl> 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ V106 <dbl> 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ V107 <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ V108 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,...
## $ V109 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,...
## $ V110 <dbl> 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1,...
## $ V111 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,...
## $ V112 <dbl> 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0,...
## $ V113 <dbl> 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,...
## $ V114 <dbl> 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0,...
## $ V115 <dbl> 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0,...
## $ V116 <dbl> 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,...
## $ V117 <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,...
## $ V118 <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ V119 <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ V120 <dbl> 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ V121 <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ V122 <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ V123 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,...
## $ V124 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,...
## $ V125 <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,...
## $ V126 <dbl> 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0,...
## $ V127 <dbl> 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,...
## $ V128 <dbl> 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0,...
## $ V129 <dbl> 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,...
## $ V130 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0,...
## $ V131 <dbl> 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0,...
## $ V132 <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,...
## $ V133 <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,...
## $ V134 <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ V135 <dbl> 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ V136 <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ V137 <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ V138 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,...
## $ V139 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,...
## $ V140 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,...
## $ V141 <dbl> 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,...
## $ V142 <dbl> 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,...
## $ V143 <dbl> 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,...
## $ V144 <dbl> 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0,...
## $ V145 <dbl> 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0,...
## $ V146 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0,...
## $ V147 <dbl> 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0,...
## $ V148 <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,...
## $ V149 <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,...
## $ V150 <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ V151 <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ V152 <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ V153 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,...
## $ V154 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,...
## $ V155 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,...
## $ V156 <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,...
## $ V157 <dbl> 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1,...
## $ V158 <dbl> 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,...
## $ V159 <dbl> 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0,...
## $ V160 <dbl> 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0,...
## $ V161 <dbl> 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0,...
## $ V162 <dbl> 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0,...
## $ V163 <dbl> 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0,...
## $ V164 <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,...
## $ V165 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,...
## $ V166 <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ V167 <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ V168 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,...
## $ V169 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,...
## $ V170 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,...
## $ V171 <dbl> 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,...
## $ V172 <dbl> 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1,...
## $ V173 <dbl> 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,...
## $ V174 <dbl> 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0,...
## $ V175 <dbl> 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0,...
## $ V176 <dbl> 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0,...
## $ V177 <dbl> 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0,...
## $ V178 <dbl> 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0,...
## $ V179 <dbl> 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0,...
## $ V180 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,...
## $ V181 <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,...
## $ V182 <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,...
## $ V183 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,...
## $ V184 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,...
## $ V185 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,...
## $ V186 <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,...
## $ V187 <dbl> 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1,...
## $ V188 <dbl> 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1,...
## $ V189 <dbl> 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,...
## $ V190 <dbl> 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0,...
## $ V191 <dbl> 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0,...
## $ V192 <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,...
## $ V193 <dbl> 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0,...
## $ V194 <dbl> 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0,...
## $ V195 <dbl> 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0,...
## $ V196 <dbl> 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,...
## $ V197 <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1,...
## $ V198 <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,...
## $ V199 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,...
## $ V200 <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,...
## $ V201 <dbl> 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,...
## $ V202 <dbl> 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1,...
## $ V203 <dbl> 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1,...
## $ V204 <dbl> 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,...
## $ V205 <dbl> 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0,...
## $ V206 <dbl> 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0,...
## $ V207 <dbl> 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0,...
## $ V208 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ V209 <dbl> 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0,...
## $ V210 <dbl> 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0,...
## $ V211 <dbl> 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1,...
## $ V212 <dbl> 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ V213 <dbl> 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1,...
## $ V214 <dbl> 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1,...
## $ V215 <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,...
## $ V216 <dbl> 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0,...
## $ V217 <dbl> 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0,...
## $ V218 <dbl> 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1,...
## $ V219 <dbl> 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1,...
## $ V220 <dbl> 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1,...
## $ V221 <dbl> 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0,...
## $ V222 <dbl> 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0,...
## $ V223 <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,...
## $ V224 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ V225 <dbl> 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,...
## $ V226 <dbl> 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1,...
## $ V227 <dbl> 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1,...
## $ V228 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1,...
## $ V229 <dbl> 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ V230 <dbl> 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0,...
## $ V231 <dbl> 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,...
## $ V232 <dbl> 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,...
## $ V233 <dbl> 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,...
## $ V234 <dbl> 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0,...
## $ V235 <dbl> 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1,...
## $ V236 <dbl> 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1,...
## $ V237 <dbl> 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1,...
## $ V238 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ V239 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ V240 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ V241 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,...
## $ V242 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,...
## $ V243 <dbl> 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1,...
## $ V244 <dbl> 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0,...
## $ V245 <dbl> 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0,...
## $ V246 <dbl> 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0,...
## $ V247 <dbl> 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,...
## $ V248 <dbl> 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,...
## $ V249 <dbl> 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0,...
## $ V250 <dbl> 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0,...
## $ V251 <dbl> 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,...
## $ V252 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,...
## $ V253 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ V254 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ V255 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ V256 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
# Render two example digit images from the full pixel data
# Digit in row 50
show_digit(digitData[50, ])
# Digit in row 100
show_digit(digitData[100, ])
gaussian_sample_with_probs <- data.frame(
x = c(54.5, 7.7, 55.9, 27.9, 4.6, 59.9, 6.4, 60.5, 32.6, 21.3, 0.5, 8.9, 70.7, 49.3, 40.1, 43, 8.1, 62.9, 56, 54.4, 42.5, 46.1, 58.3, 61.7, -11.6, 10.8, 27.5, 12.2, 67.7, -5.6, 13.3, 62.7, 37.2, 41.4, 47.4, 54.2, 31, 60.2, 69.9, 33.8, 25.4, 21.9, 17.9, 61.5, 49.8, 37.9, 55.8, 14.1, 53.3, 45.6, 44.7, 14.2, -5.7, 10.9, 63.7, -6.5, 50.3, 61.4, 35.1, -3.7, 68.4, -6.2, 64, 24.4, 65.7, 59.7, 52.7, 27.2, 17.5, 22.6, 14.7, 22.1, 61.5, 55.6, 62.6, 5.6, 52.3, 8, 25.4, 48.8, 58.4, 6.2, 52.3, 6.6, 64, 43, 60.6, 33.5, 45.8, 2.5, 63, 58.2, 50.9, 22.1, 36.5, 27.1, 61.4, 56.3, 63.5, 55.6, 53.8, 31.9, 30.7, 15.6, 14.8, 44.4, 51.9, 61.4, 11.8, 51.3, 58.6, 45.4, 8.3, 41.5, 52.7, 9.1, 60.8, 40.2, 20.5, 40.2, 59.2, 36.7, 47.5, 12.2, 7.7, 56.2, -13.2, 6, 58.7, 43.7, 67.3, 53.6, 37.6, 54.3, 37.7, 51.9, 10.5, 42, 24, -0.7, 53.1, 27.4, 57.2, 37.3, 28.6, 13.5, 35.2, 22.7, 35.8, 66.9, 45.9, 45.9, 56.7, 55.6, 58.3, 3.2, 45.9, 59.5, 50.8, 43.7, 42.8, 4.7, 29.5, 50.9, 7.8, 44.3, 53.6, 57, 57.8, 47.3, 56.8, 51.1, 27.7, 44.9, 33, 44, 42.1, 38, 52.3, 44, 28.1, 52.7, 53.6, 4.7, 42.1, 40.8, 5, 8, 49.1, 67.5, 16.2, 11.2, 14.6, 32.8, 61.3, 49.8, 51.5, 54.5, 51.6, 45.8, 55.9, 7.4, -10.2, 41.9, 27.4, 45.1, 17.7, 37.5, 53.5, 25.7, 18.1, 13.4, 40.5, 13.3, 2, 49.8, 66.7, 34.7, 11.4, 42.1, 54.4, 48.3, 38.3, 17.4, 48.2, 48.4, 57.4, 54.5, 13.6, 52.3, -0.1, 12.8, 29.3, 45.6, 62.3, 49.2, 32.6, 38.4, 15, 6.1, 12.2, 5.8, 17.7, 20.7, 43.6, 52.3, 42.4, 64.6, 34.3, 9.5, 3.6, 37.2, 45.7, 56.9, 67, 48.7, -3.1, 50.1, 45.4, 54.4, 38.1, 10.8, 7.4, 50.5, 24.7, 11.4, 59.5, 43.9, 4.4, 53.7, 41.9, 60.2, 49.5, 11.6, 51.1, 69.1, 46.2, 35.5, 15, -6.4, 59.9, 57.3, 49.1, 55.5, 55.6, 43.9, 52.5, 46.4, 5.8, 55.3, 22.2, 42.7, 51.3, 40.1, 62.1, 62.2, 48.8, 6.1, 0.6, 19.6, 36.8, 48, 33.8, 52.8, 66.6, 30.2, 45.9, 5.9, 52.7, 49.7, 37.7, 10.4, 60.1, 35.8, 62.1, 35, 38.7, 13.3, -4.9, 30.6, 55.9, 23.7, 12.6, 45.7, 38.1, 9.9, 39.6, 46.3, -3.5, 31.2, 8.3, -8.1, 31.4, 65.7, 10.7, 5.5, 54.4, 51.8, 59.8, 50.3, 45.1, 8.5, 15.3, 3.2, 19.3, 40.8, 
48.4, 30.1, 32.7, 12.7, 59.2, 51.4, 55.3, 58.9, -19, 61.9, 30.3, 77.2, 39.8, 31.3, 23.1, 56, 41.9, 0.5, 33.4, 36.6, 54.4, 12.4, 16.4, 24.4, -2.4, 30.9, 56.4, 12.5, 65.2, 10, -1.7, 45.7, 49.5, 45.3, 17.5, 29, -8.7, 51.7, 17.3, 20.2, 14.6, 47.6, 55.3, 50.2, 4.1, 47.5, 71, 13.2, 75.4, 6.2, 53, 54.2, 40.6, 55.1, 67.4, 45, 47.3, 44.2, 8.4, 46.1, 48.7, 8.3, 40.4, 63, 49, 2.8, 50.4, 17.7, 40.4, 41.1, 56.6, 37.3, -0.1, 62.5, 47.7, 62.1, 16.6, 33.3, 4.1, 61, 49.4, 44.1, 18.7, -1.3, 42.1, -11.8, 40.6, 45.6, 14.9, 51.9, 57.4, 41.3, 59.2, 58.6, 50.5, -3.9, -0.6, 11.5, 54.5, 57.1, 46.2, 51.9, 58.2, 51.6, 50.3, 64.2, 8.3, 49, 42, 43.7, 53.4, 6.5, 36.6, -18.2, 41.8, -6.8, 35, 46.8, 43.8, 60.6, -11.3, 18.5, 0.3, 40.2, 73.3, 58.2, 43.9, 22.2, 12.8, 6.7, 36.3, 51.8, 33.6, 71, 56.8, 26, 43.3, 37.4, 60, 17.2, -10.3, 43.9, 69, 38.7, 57.9, 40.2, 48.6, 57.7, 45.8, 56.2, 7.3, 32.1, 41.2, 39.1),
prob_cluster1=c(0, 1, 0, 0.552, 1, 0, 1, 0, 0.158, 0.947, 1, 1, 0, 0, 0.01, 0.003, 1, 0, 0, 0, 0.004, 0.001, 0, 0, 1, 0.999, 0.591, 0.999, 0, 1, 0.998, 0, 0.03, 0.006, 0.001, 0, 0.268, 0, 0, 0.107, 0.773, 0.933, 0.985, 0, 0, 0.023, 0, 0.997, 0, 0.001, 0.002, 0.997, 1, 0.999, 0, 1, 0, 0, 0.065, 1, 0, 1, 0, 0.834, 0, 0, 0, 0.626, 0.988, 0.912, 0.996, 0.928, 0, 0, 0, 1, 0, 1, 0.773, 0, 0, 1, 0, 1, 0, 0.003, 0, 0.118, 0.001, 1, 0, 0, 0, 0.926, 0.038, 0.631, 0, 0, 0, 0, 0, 0.201, 0.286, 0.994, 0.996, 0.002, 0, 0, 0.999, 0, 0, 0.001, 1, 0.005, 0, 1, 0, 0.009, 0.961, 0.009, 0, 0.036, 0, 0.999, 1, 0, 1, 1, 0, 0.002, 0, 0, 0.025, 0, 0.024, 0, 0.999, 0.004, 0.855, 1, 0, 0.604, 0, 0.028, 0.484, 0.997, 0.062, 0.909, 0.05, 0, 0.001, 0.001, 0, 0, 0, 1, 0.001, 0, 0, 0.002, 0.003, 1, 0.398, 0, 1, 0.002, 0, 0, 0, 0.001, 0, 0, 0.576, 0.001, 0.138, 0.002, 0.004, 0.022, 0, 0.002, 0.533, 0, 0, 1, 0.004, 0.007, 1, 1, 0, 0, 0.993, 0.999, 0.996, 0.147, 0, 0, 0, 0, 0, 0.001, 0, 1, 1, 0.005, 0.606, 0.001, 0.987, 0.026, 0, 0.754, 0.985, 0.998, 0.008, 0.998, 1, 0, 0, 0.076, 0.999, 0.004, 0, 0, 0.019, 0.988, 0, 0, 0, 0, 0.997, 0, 1, 0.998, 0.417, 0.001, 0, 0, 0.162, 0.018, 0.995, 1, 0.998, 1, 0.987, 0.956, 0.002, 0, 0.004, 0, 0.087, 0.999, 1, 0.03, 0.001, 0, 0, 0, 1, 0, 0.001, 0, 0.021, 0.999, 1, 0, 0.82, 0.999, 0, 0.002, 1, 0, 0.005, 0, 0, 0.999, 0, 0, 0.001, 0.057, 0.995, 1, 0, 0, 0, 0, 0, 0.002, 0, 0.001, 1, 0, 0.925, 0.003, 0, 0.009, 0, 0, 0, 1, 1, 0.971, 0.035, 0, 0.104, 0, 0, 0.329, 0.001, 1, 0, 0, 0.024, 0.999, 0, 0.05, 0, 0.067, 0.016, 0.998, 1, 0.298, 0, 0.87, 0.998, 0.001, 0.021, 0.999, 0.012, 0.001, 1, 0.247, 1, 1, 0.233, 0, 0.999, 1, 0, 0, 0, 0, 0.001, 1, 0.995, 1, 0.975, 0.007, 0, 0.34, 0.157, 0.998, 0, 0, 0, 0, 1, 0, 0.325, 0, 0.011, 0.246, 0.895, 0, 0.005, 1, 0.12, 0.037, 0, 0.998, 0.992, 0.834, 1, 0.273, 0, 0.998, 0, 0.999, 1, 0.001, 0, 0.001, 0.987, 0.446, 1, 0, 0.989, 0.964, 0.996, 0, 0, 0, 1, 0, 0, 0.998, 0, 1, 0, 0, 0.008, 0, 0, 0.001, 0.001, 0.002, 1, 0.001, 0, 1, 0.008, 
0, 0, 1, 0, 0.986, 0.008, 0.006, 0, 0.028, 1, 0, 0, 0, 0.991, 0.128, 1, 0, 0, 0.002, 0.98, 1, 0.004, 1, 0.008, 0.001, 0.996, 0, 0, 0.006, 0, 0, 0, 1, 1, 0.999, 0, 0, 0.001, 0, 0, 0, 0, 0, 1, 0, 0.004, 0.002, 0, 1, 0.037, 1, 0.005, 1, 0.068, 0.001, 0.002, 0, 1, 0.981, 1, 0.009, 0, 0, 0.002, 0.925, 0.998, 1, 0.042, 0, 0.114, 0, 0, 0.725, 0.003, 0.027, 0, 0.989, 1, 0.002, 0, 0.016, 0, 0.009, 0, 0, 0.001, 0, 1, 0.189, 0.006, 0.014)
)
# In a two-cluster model the membership probabilities are complementary,
# so cluster 2's probability is one minus cluster 1's
gaussian_sample_with_probs <- gaussian_sample_with_probs %>%
mutate(prob_cluster2 = 1-prob_cluster1)
glimpse(gaussian_sample_with_probs)
## Observations: 500
## Variables: 3
## $ x <dbl> 54.5, 7.7, 55.9, 27.9, 4.6, 59.9, 6.4, 60.5, 32.6, 21...
## $ prob_cluster1 <dbl> 0.000, 1.000, 0.000, 0.552, 1.000, 0.000, 1.000, 0.00...
## $ prob_cluster2 <dbl> 1.000, 0.000, 1.000, 0.448, 0.000, 1.000, 0.000, 1.00...
# Probability-weighted mean of x for each cluster
means_estimates <- gaussian_sample_with_probs %>%
  summarise(
    mean_cluster1 = sum(x * prob_cluster1) / sum(prob_cluster1),
    mean_cluster2 = sum(x * prob_cluster2) / sum(prob_cluster2)
  )
means_estimates
## mean_cluster1 mean_cluster2
## 1 10.39535 49.46501
# Mixing proportions: average membership probability per cluster
props_estimates <- gaussian_sample_with_probs %>%
  summarise(
    props_cluster1 = mean(prob_cluster1),
    props_cluster2 = mean(prob_cluster2)
  )
props_estimates
## props_cluster1 props_cluster2
## 1 0.33148 0.66852
# Coerce the one-row data frame to a plain numeric vector for geom_vline()
means_estimates <- as.numeric(means_estimates)
# Histogram of x with vertical lines at the two estimated cluster means
ggplot(gaussian_sample_with_probs) +
  geom_histogram(aes(x = x), bins = 100) +
  geom_vline(xintercept = means_estimates)
gaussian_sample <- data.frame(
x=c(6.4, 5.9, 57.8, 52.6, 54.3, 52.3, 4.4, 49.1, -4, 12.7, 19.8, 51.8, 35.4, 17.1, 38.8, 44.1, 45.6, 7.9, 57.7, 51.1, 14.1, 36.6, 51.6, 4.1, -1.8, 55.1, 52.4, 54.4, 47.9, 36.6, 53.9, 15, 68.8, 8.3, 40.8, 39.3, 37.1, 12.7, 54.6, 34.1, 24.9, 58.5, 50.8, 48.6, 60, 52.1, 61.5, 6.9, 63, 63.5, 54.1, 37.7, 52.6, 49.1, 53.7, 13.4, 23.6, 45.5, 33.4, 46.4, 46.6, 56.1, 37.8, 44.1, 62.4, 12, 54.4, 31.6, -1, 9.4, 16, 53.4, 71.1, 8.9, 64.4, 55.9, 50.5, 57.2, 45.9, 18.5, 53.9, 12.5, 12.2, 1.5, 0.3, 40.1, 13.9, 53.2, 12.1, 57.2, 2.3, -2.6, 2.7, 59.6, 3, 10.3, 66.9, 57.3, 57.6, 9.1, 43.8, 51.1, 7.7, 13.4, 46.3, 57.5, 0.2, 1.9, 43.8, 53.9, 9.3, 45.5, 15.4, -3.2, -1.2, 40.5, 1.9, 14.5, -2, 3.4, 54.1, 2.9, 58.2, 49.5, 49.1, 60.2, 45.3, 59.7, 38, 22.4, 42.6, 53.6, 7.3, 43.9, 2.8, 66.5, 56.5, 44.4, 53.5, 40.6, 57.1, 43.8, -3.1, 47.3, 42.5, 50.8, -12, -12, 15.2, 43.8, 57.3, 32.2, 61.1, 15.1, 5.8, 24.7, 51.5, 7.7, -5.1, 63.1, 50.1, 39.9, 38.7, -5.2, 50.3, 49.1, 58.1, 31.3, 54.6, 39.1, 4.4, 60.5, 45.6, 59.7, 39.5, 60.6, 42.8, 49.5, 12.9, 47.2, 50, 11.4, 50.9, 57.3, 46.7, 35.6, 38.8, 56, -5.7, 50.5, 21.2, 45.9, 60.7, 22.1, 46.7, 12.5, 55.2, 48.4, 36.6, 54, 47, 50.3, 51.7, 11, 56, 42.4, 61.8, 45.6, 60.5, 40.6, 8.8, 21, 5.6, 68.2, 21.3, 11.5, 47.2, 26.4, 35.8, 25.4, 19.6, 56, 9.1, 63.4, 48.5, 3.2, 57.1, 52.7, 11.3, 16.3, 49, 46.5, 12.4, 9.6, 45.5, 55.3, 72.9, 8.1, -3.8, 53.8, 34.1, 45.7, 56.3, 44, 23.4, 57.2, 0.5, 33.2, 63.4, 37.3, 57.3, 52.7, 9.7, 51.9, 39.4, 63.7, 23.3, 39.9, -0.5, 41.6, 11.3, 48, 38.2, 54.2, 41.3, 30.6, 55.2, 48.9, 34.4, 16.2, 45.7, 10.1, 42.7, 12.2, 39.5, 14.1, 64.9, 53.1, 50.4, 47, 58.5, 50.8, 43.9, 56.8, 12.6, 44.5, 54.6, 8.9, 15.5, 50.2, 4.8, 52.8, 14.4, 33.7, 5.4, -0.2, 19.8, 51, 59.4, -8.2, 10.4, 47.8, 31.2, 41.4, 9.4, -3.2, 21.1, 44.7, 22.9, 11.5, 49.6, 26.7, 11.5, 35.2, 9.4, 44.8, 63.1, 8.5, 21, 30.9, 16.1, 54.4, 53.4, 9.7, 49.8, 45.6, -3, 53, 43.4, 43.4, 43.9, 56.6, 33.5, 55.1, 54.4, 62.8, 37.9, 35.1, 8.6, 7.1, 46.1, 6.1, 27, -9.9, 6.4, 44.6, 49, 46, 42.4, 9.5, 
47.1, 51.3, -4.7, 14, 64.8, 38, 33.6, -0.4, 53.5, 40.3, 47.2, 58.5, 45.4, 2.5, 52.9, 47.4, 56.1, 17.7, 3.9, 30.7, 44.6, 42.4, 55.4, 47.1, 11.5, 50.7, 47.6, 11.3, 45.1, 44.2, 46.6, 36.9, 47.4, 54.6, -2, 50.7, 63.6, 58.9, 7.6, -3.1, 31.1, 44.9, 55.7, 16.6, 64.3, 27.1, 23, 48.7, -0.8, 23.6, 72.8, 11.9, 57.3, 25.4, 47.1, 9.4, 57.6, 39.6, 25.3, 31.2, 52.4, 51.1, 1.6, 76.5, 50.7, 34.2, 7.6, 25.4, 11.7, 53.5, 17.5, 53.7, 61.2, 49.9, 48.8, 40.8, 61.2, 16.4, 48.6, 7.5, -2, 64.2, 26.2, 11.2, 3.2, -4.3, 37.9, 47.7, 26.3, 58, 66.9, 59.1, 35.8, 14.2, 53, 60.3, 63.3, 53.6, 47.6, 57.1, 37, 47.6, 61.6, 52.7, 0.8, 50.5, 48.1, -3.4, 53.6, 35.7, 49.8, 2.7, 59.9, 36.5, 63.6, 53.3, 3.8, 20.2, 19.7, 20.7, 45.6, 39.8, 37.2, 38.6, 12.4, 56.3, 59.6, 10.5, 11, -6.8, 58.8, 49.5, -3.6, 51.1, 53.1, 46, 57.9, 15.2, -2.3, 22.9, 32.8, 37.6, 52, 77.5, 2.2, 9.5, 40.4, 48.5, 27.2, 37.4)
)
str(gaussian_sample)
## 'data.frame': 500 obs. of 1 variable:
## $ x: num 6.4 5.9 57.8 52.6 54.3 52.3 4.4 49.1 -4 12.7 ...
# Create data frame with probabilities
gaussian_sample_with_probs <- gaussian_sample %>%
mutate(prob_from_cluster1 = 0.35 * dnorm(x, mean = 10, sd = 10),
prob_from_cluster2 = 0.65 * dnorm(x, mean = 50, sd = 10),
prob_cluster1 = prob_from_cluster1/(prob_from_cluster1 + prob_from_cluster2),
prob_cluster2 = prob_from_cluster2/(prob_from_cluster1 + prob_from_cluster2)) %>%
select(x, prob_cluster1, prob_cluster2)
head(gaussian_sample_with_probs)
## x prob_cluster1 prob_cluster2
## 1 6.4 9.998524e-01 0.0001475847
## 2 5.9 9.998792e-01 0.0001208354
## 3 57.8 7.976210e-06 0.9999920238
## 4 52.6 6.384176e-05 0.9999361582
## 5 54.3 3.234434e-05 0.9999676557
## 6 52.3 7.198080e-05 0.9999280192
# E-step of the EM algorithm for a two-component univariate Gaussian mixture.
#
# Args:
#   data:        data frame with a numeric column `x`.
#   means:       length-2 vector of component means.
#   proportions: length-2 vector of mixing proportions.
#   sds:         length-2 vector of component standard deviations.
#
# Returns a data frame with columns x, prob_cluster1, prob_cluster2, where
# the prob_* columns are the posterior membership probabilities per point.
expectation <- function(data, means, proportions, sds){
  # Weighted component densities evaluated at every observation
  weighted_density_1 <- proportions[1] * dnorm(data$x, mean = means[1], sd = sds[1])
  weighted_density_2 <- proportions[2] * dnorm(data$x, mean = means[2], sd = sds[2])
  total_density <- weighted_density_1 + weighted_density_2
  # Normalize to posterior membership probabilities (each row sums to 1)
  data.frame(x = data$x,
             prob_cluster1 = weighted_density_1 / total_density,
             prob_cluster2 = weighted_density_2 / total_density)
}
# M-step of the EM algorithm: re-estimate component means and mixing
# proportions from the posterior probabilities produced by expectation().
#
# Args:
#   data_with_probs: data frame with columns x, prob_cluster1, prob_cluster2.
#
# Returns a two-element list: [[1]] the updated means (length-2 numeric),
# [[2]] the updated mixing proportions (length-2 numeric, sums to 1).
maximization <- function(data_with_probs){
  obs <- data_with_probs$x
  resp_1 <- data_with_probs$prob_cluster1
  resp_2 <- data_with_probs$prob_cluster2
  # Responsibility-weighted means for each component
  updated_means <- c(sum(obs * resp_1) / sum(resp_1),
                     sum(obs * resp_2) / sum(resp_2))
  # Mixing proportion is the average responsibility; the pair sums to 1
  prop_1 <- mean(resp_1)
  updated_props <- c(prop_1, 1 - prop_1)
  list(updated_means, updated_props)
}
# Initial guesses for the EM algorithm: deliberately far-apart means and
# equal mixing proportions.
means_init <- c(0, 100)
props_init <- c(0.5, 0.5)
# Iterative process
# Each pass runs one E-step (expectation) then one M-step (maximization),
# feeding the updated means and proportions back in. The component standard
# deviations are held fixed at 10 throughout.
for(i in 1:10){
new_values <- maximization(expectation(gaussian_sample, means_init, props_init, c(10, 10)))
means_init <- new_values[[1]]
props_init <- new_values[[2]]
# Print iteration number, current means, then current proportions
cat(c(i, means_init, props_init), "\n")
}
## 1 25.28863 56.90005 0.6797875 0.3202125
## 2 20.01129 53.44814 0.539439 0.460561
## 3 14.77156 51.48322 0.4377961 0.5622039
## 4 11.62146 50.28191 0.3846544 0.6153456
## 5 10.34764 49.72052 0.363436 0.636564
## 6 9.918957 49.49888 0.355935 0.644065
## 7 9.777305 49.41932 0.3533705 0.6466295
## 8 9.730017 49.39192 0.3525025 0.6474975
## 9 9.714139 49.38262 0.3522096 0.6477904
## 10 9.708796 49.37948 0.3521109 0.6478891
# Scaled Gaussian density for overlaying one mixture component on a histogram.
#
# Args:
#   x:          points at which to evaluate the curve.
#   mean:       component mean.
#   proportion: mixing proportion used to scale the density.
#   sd:         component standard deviation. Defaults to 10 — the value that
#               was previously hard-coded — so all existing calls behave
#               exactly as before, while other sds can now be plotted too.
fun_gaussian <- function(x, mean, proportion, sd = 10){
  proportion * dnorm(x, mean, sd = sd)
}
# Snapshot the parameters reached after the 10 EM iterations above
means_iter10 <- means_init
props_iter10 <- props_init
# Overlay each fitted, proportion-scaled component density on the histogram
gaussian_sample %>% ggplot() +
geom_histogram(aes(x = x, y = ..density..), bins = 200) +
stat_function(geom = "line", fun = fun_gaussian,
args = list(mean = means_iter10[1], proportion = props_iter10[1])
) +
stat_function(geom = "line", fun = fun_gaussian,
args = list(mean = means_iter10[2], proportion = props_iter10[2])
)
Chapter 3 - Mixture of Gaussians with flexmix
Univariate Gaussian Mixture Models:
Univariate Gaussian Mixture Models with flexmix:
Bivariate Gaussian Mixture Models with flexmix:
Bivariate Gaussian Mixture Models with flexmix:
ggplot(aes(x = Weight, y = BMI)) + geom_point() + geom_path(data = data.frame(ellipse_comp_1), aes(x=x,y=y), col = "red") + geom_path(data = data.frame(ellipse_comp_2), aes(x=x,y=y), col = "blue")
Example code includes:
xExample <- c(7.3, 58.7, 9.7, 16.9, 6.3, 35.1, 33.5, 61.3, 28.3, 24.3, 58.6, 13.1, 58.7, 34, 29.1, 46.4, 54.6, 5.9, 30.6, 27.9, 27.5, -5.3, 37.6, 9.1, 44.5, 57.5, 30.5, 5, 51.9, 33.6, 37.4, 28.8, 47.9, 5.4, 64.1, 45.1, 41, 36.3, 28.2, 33.8, 9.8, 57.4, 48.4, 58.3, 27.7, 38.4, 36.4, 66.9, 30.7, 34.3, 25.9, 48.5, 52, 0.3, 45.3, 31.9, 21.6, 36.6, 29, 13.2, 41.5, 8.2, 46.6, 30.6, 48.6, 5.6, 39.3, 30.5, 34.2, 61.5, 4.2, 71.3, 42.5, 32.7, 54.4, 19.2, 13.3, 40.3, 72, 21.8, 49.5, 38.7, 9.6, 49.6, 32, 30.9, 28.6, 30.1, 29.8, 67.9, 60.8, 55, 34.6, 32.8, 11.9, 50.5, 32.1, 13.7, 48.6, 32.6, 9.1, 27.6, 35.6, 28.3, 15.1, 54.7, 30.8, 22.2, 27.5, 49.3, 56, 26.1, 57.2, 46.4, 50.3, 43.6, 51.8, 47.5, 15.5, 60.2, 63.6, 45.3, 14.1, 42.1, 31.4, 42.4, 61.7, 60.1, 27.7, 55.9, 3.3, 18.7, 58.1, 46, 14, 41.7, 28.9, 29.1, 56.9, 32.3, -0.8, 29.4, 27.3, 33.5, 39.1, 13.9, 28.7, 29.4, 10.3, 44.3, 57.1, 76, 49.4, 44.9, 23.2, 53.9, 33.6, 32.7, 30, 57, 63.6, 32.9, 8.6, 26.5, 26, 53.3, 40.8, 30.1, 10.5, 47.2, 30.2, 49.3, 52.4, 48.8, 51.4, 40.7, 33.8, 45.7, 28.1, 13.2, 28.4, 31.7, 30, 29.6, 49.5, 35, 62, 51.9, 39, 15.4, 59.1, 54.8, 9.2, 9.7, 35.4, 32.9, 31.3, 30.4, 64.4, 63.4, 32.9, 40.6, 37.5, 52.3, 35.3, 8.1, 6.4, 26.2, 29.2, 29.7, 27.8, 35.2, 34.1, 29.8, 49, 65.6, -1.1, 28.6, 33.7, 48.1, 45.7, 30.3, 32.7, 64.5, 29.8, 52.5, 48.4, 48.8, 26.4, 37.4, 33.2, 46.1, 29.5, -0.9, 49.8, 34.1, 48.9, 12.5, 36.6, 22.1, 57.3, 9.5, 9.4, 58.5, 50.2, 45.3, 25.3, 27.4, 4.5, 58.5, 63.4, 48.7, 42.6, 33, 47.9, 30.3, 54.9, 7.9, 50.2, 11.2, 59.7, 46.5, 57.5, 26.9, 28.5, 29.7, 52.5, 16.9, 29.8, 28.6, 31.2, 65.3, 1.7, 31.4, 52.5, 5.1, 66.1, 51.5, 9.5, 9.8, 41.6, 0.3, 10.4, 15.5, 34.8, 27.5, 43.6, 31.4, 46.3, 4.6, 45.8, 49.2, 10.7, 48.1, 7.3, 33.4, 10.7, 53.4, 28.9, 51.1, 52.4, 55.9, 56.8, 47.2, 46.8, 30.8, 60.3, 53.6, 30.9, 70.8, 11.2, 7.5, 55.8, 14.3, 25.8, 14.5, 30.9, 60.8, 26.8, 16.5, 31.4, 26.6, 10.6, 53.4, 33.1, 33.1, 46.3, 8.2, 56, 14.1, 25.5, 59.6, 61.9, 58.6, 63.1, 47.7, 30.5, 42.4, 56.2, 17, 13.4, 34.4, 1.1, 18.4, 
63.9, 38.6, 15, 30.1, 23.9, 5.9, 53.8, 18.2, 22.7, 45.7, 29.2, 8.4, 52.5, 42, 28.7, 61.7, 35.4, 32.5, 5.5, 6.8, 60.1, 29.4, 31.5, 2.3, 28.3, 29.6, 34.9, 33.2, 28.9, 33.9, 51, 35.4, 52.3, 60, 27.1, 24.7, 57.7, 32.7, 52.5, 66.3, 37.8, 46.3, 38.1, 30.6, 55.6, 44.9, 28.4, 28.9, 19, 7.7, 9.4, 36, 49.9, 42.2, 28.2, 11.5, 52.4, 46.3, 52.4, 27.4, 15.6, 62.3, 51.7, 41.6, 6.2, 10.5, 14.7, 30.4, 23.9, 58.7, 36.1, 47.6, 31.2, 29.1, 60.1, 18, 30, 56.5, 42.7, 27.1, 45.5, 36.6, 46.4, 25.9, 15.4, 31.6, 3.3, 33.6, 63.3, 57.1, 32.3, 11.8, 32.9, 47.2, 31.2, 49.3, 61.7, 11.5, 9.7, 49.6, 45.7, 16.1, 27.4, 22.8, 8.5, 56.2, 26, 45.7, 29, 34.6, 29.4, 3.9, 45.7, 31.7, 52.6, 40.2, 35.5, 5.8, 56.4, 49.5, 30.6, 40.2, 20.8, 43.9, 32.1, 40.8, 45.6, 32.8, 7.4, 27.5, 29.4, 50.8, 43.9, 36.8, 5.5, 61.5, 41.5, 47.5, 13.9, 30.1, 67.3, 27.1, 50.8, 37.4, 28, 25, 37.1, 49.3, 25.3, 26.9, 34.9, 51.8, 33.9, 34.7, 44.2, 10.1, 71.3, 47.5, 23.4, 45.7, 49.4, 32.6, 6.9, 67.8, 56.8, 41.9, 50.7, 31.5, 55, 14.2, 34.8, 26.2, 25.8, 64, 63.8, 56.4, 42.1, 29.5, 49.4, 30.2, 16.2, 30, -0.2, 30.7, 29.6, 57, 41.5, 6.4, 9.7, 47.1, 19.4, 39.8)
xExample <- c(xExample, 32.9, 53.6, 8.4, 32.8, 63.1, 58.4, 7.5, 26, 41.8, 29, 36.9, 41.5, 39.5, 14.1, 27.4, 14.9, 48.4, 34.8, 72.8, 36.9, 27.8, 27.6, 6.1, 43.8, 36.9, 58.5, 55.1, 45.2, 2.6, 20.4, 59, 60.6, 57.7, 29.8, 60.2, 36.9, 29, 28, 46.5, 55, 29.6, 52.6, 38, 45.3, 5.7, 44.8, 35.3, 56.1, 30.3, 32.4, 56.9, 30.8, 44.8, 62.8, 46.1, 57.2, 50.5, 46.4, 37.6, 29.9, 8.6, 35.5, 47.4, 27.2, 36.4, 33.1, 29.4, 25.8, 46, 27.6, 45.7, 32.3, 12.8, 49.8, 13.7, 65.3, 48.5, 39.6, 4, 32.1, 49.6, 44, 74.5, 31, 52.6, 33.3, 56.8, 11.4, 33.7, 34.3, 25.8, 39.8, 7.3, 33.6, 7.9, 49.6, 52.6, 36.5, 43, 14.7, 43.5, 37, 50.8, 46.5, 46.9, 25.4, 32.7, 48.4, 40.3, 45.9, 51.3, 24, 48.3, 39.5, 21.2, 48.1, 56.9, 32.3, 10.2, 9.3, 40.3, 52.8, 34.5, 32.4, 30.1, 10.8, -3.8, 30.4, 58.2, 57.3, 48.9, 36.1, 46.2, 69, 67.8, 58.5, 41.9, 29.6, 51.7, 39.4, 50.8, 29.2, 56.1, 54.4, 17.2, 57.5, 54.1, 48.6, -0.9, 56.3, 27.7, 58.8, 57, 44.1, 6.3, 4.1, 35.9, 60.2, 44.1, 53.9, 33.3, 35.4, 32.1, 56, 56.8, 30.1, 43.1, 64.6, 27.7, 30.7, 53, 66, 29.1, 45, 12.3, 41.3, 54.7, 45.3, 13.3, 9.7, -2, 29.1, 29.5, 31.3, 29.2, 13.8, 26.7, 7.4, 36.8, 42.6, 54.7, 51.3, 42.6, 18, 34, 44.1, 53.6, 44.7, 28.9, 64.9, 60, 66.6, 32.9, 15.5, 37.6, 8.3, 28.5, 16.2, 39.7, 25.9, 8.8, 30.9, 9.9, 39.3, 66.4, 62.4, 53.8, 9.3, 44.7, 50.4, 57.8, 29, 50.1, 28.5, 62.9, 16.3, 54, 45.4, 60.6, 9, 7.7, 64.2, 54.4, 53.3, 45.5, 38, 5.2, 61.7, 10.8, 4.3, 24.8, 26.5, 32.2, 4.5, 49.3, 3.9, 39.6, 26.8, 36.3, 65.1, 59.6, 61.3, 30.1, 65.5, 55.8, 48.2, 49.8, 11.2, 64.2, 29, 44.6, 59.9, 12.6, 51.8, 14.5, 28.8, 49.8, 30.4, 42.7, 2.8, 31.1, 29.2, 27.4, 49.9, 28.2, 59.5, 28.7, 9.4, 30.2, 33.3, 30, 26, 65.1, 55.9, 30.5, 61.1, 50.3, 31.3, 58.2, 41.3, 33.4, 14.8, 51.2, 40.8, 34.1, 33.7, 29.4, 56, 26.4, 30.7, 55.1, 49.7, 37.7, 56.9, 38.5, 28.8, 50.3, 45.7, 13.2, 32.8, 30.5, 30.6, 61.5, 57.7, 33.6, 24.6, 53.9, 36.1, 37.4, 55.5, 27.4, 44.2, 15.4, 56.3, 28.1, 28.8, 67.6, 17.7, 48.5, 57.5, 33.7, 12.9, 19.5, 30.6, 56.8, 75.4, 26, 32.3, 28.3, 10.7, 9, 66.5, 51.6, 30.2, 46, 
44.1, 53, 33.9, 28.4, 53.1, 42.3, 55.2, 42.4, 9.4, 36.3, 26.6, 41.2, 33, 42.1, 27, 25.4, 53.8, 56.7, 22.2, 29.5, 30.9, 9.3, 30.4, 48.1, 30.9, 28.4, 38.6, 28.8, 52, 16.5, 64.3, 56.1, 51.4, 50.2, 30.1, 67.3, 62.3, 12.9, 27.9, 38.9, 29.3, 17.4, 30, 62.5, 40.5, 48, 31.9, 54.7, 27.4, 28.2, 46.6, 14, 61.9, 59.4, 65.4, 30.2, 28.9, 35.4, 55.8, 51.4, 47.8, 34, 56.2, 26.5, 30.2, 8.4, 10.9, 63.9, 41.9, 31.3, 52.8, 36, 45.4, -2, 57.3, 80.3, 41, 13.8, 31.9, 33.8, 48.5, 16.7, 29.5, 6.7, 42.1, 32.2, 45.7, 18.9, 30.5, 30.9, 40.2, 14.6, 41.2, 27, 6.1, 34.9, 57.5, 30.1, 56.6, 62.4, 11.5, 25.7, 14.8, 28.2, 43.5, 37.7, 32.1, 44.4, 56.2, 7.6, 29.4, 63.4, 53, 14.6, 50.1, 62.6, 29.3, 33.5, 52.7)
mix_assign <- c(1, 2, 1, 1, 1, 2, 2, 2, 3, 3, 2, 1, 2, 2, 3, 2, 2, 1, 3, 3, 3, 1, 2, 1, 2, 2, 3, 1, 2, 3, 2, 3, 2, 1, 2, 2, 2, 2, 3, 3, 1, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 1, 2, 3, 1, 3, 3, 1, 2, 1, 2, 3, 2, 1, 2, 3, 3, 2, 1, 2, 2, 3, 2, 1, 1, 2, 2, 3, 2, 2, 1, 2, 3, 2, 2, 3, 3, 2, 2, 2, 3, 3, 1, 2, 3, 1, 2, 2, 1, 3, 2, 3, 1, 2, 3, 1, 3, 2, 2, 3, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 3, 2, 2, 2, 3, 2, 1, 1, 2, 2, 1, 2, 3, 3, 2, 3, 1, 3, 3, 2, 2, 1, 3, 3, 1, 2, 2, 2, 2, 2, 1, 2, 3, 2, 2, 2, 2, 3, 1, 3, 3, 2, 2, 3, 1, 2, 3, 2, 2, 2, 2, 2, 3, 2, 3, 1, 3, 3, 3, 3, 2, 3, 2, 2, 2, 1, 2, 2, 1, 1, 3, 3, 3, 3, 2, 2, 3, 2, 3, 2, 2, 1, 1, 3, 3, 3, 3, 3, 3, 2, 2, 2, 1, 3, 3, 2, 2, 3, 3, 2, 3, 2, 2, 2, 3, 2, 3, 2, 3, 1, 2, 3, 2, 1, 3, 3, 2, 1, 1, 2, 2, 2, 2, 3, 1, 2, 2, 2, 2, 2, 2, 3, 2, 1, 2, 1, 2, 2, 2, 3, 3, 3, 2, 1, 3, 3, 2, 2, 1, 3, 2, 1, 2, 2, 1, 1, 2, 1, 1, 1, 3, 3, 2, 3, 2, 1, 2, 2, 1, 2, 1, 3, 1, 2, 3, 2, 2, 2, 2, 2, 2, 3, 2, 2, 3, 2, 1, 1, 2, 1, 3, 1, 3, 2, 3, 1, 3, 3, 1, 2, 3, 2, 2, 1, 2, 1, 3, 2, 2, 2, 2, 2, 3, 2, 2, 1, 1, 3, 1, 1, 2, 3, 1, 3, 3, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 2, 3, 1, 1, 2, 3, 2, 1, 3, 3, 2, 3, 3, 3, 2, 3, 2, 2, 3, 3, 2, 3, 2, 2, 2, 2, 2, 3, 2, 2, 3, 3, 1, 1, 1, 2, 2, 2, 3, 1, 2, 2, 2, 3, 1, 2, 2, 2, 1, 1, 1, 3, 3, 2, 3, 2, 3, 2, 2, 1, 3, 2, 2, 3, 2, 3, 2, 3, 1, 3, 1, 3, 2, 2, 3, 1, 3, 2, 3, 2, 2, 1, 1, 2, 2, 1, 3, 3, 1, 2, 3, 2, 3, 2, 3, 1, 2, 2, 2, 2, 3, 1, 2, 2, 3, 2, 2, 2, 3, 2, 2, 3, 1, 3, 3, 2, 2, 2, 1, 2, 2, 2, 1, 3, 2, 3, 2, 2, 3, 3, 2, 2, 3, 3, 3, 2, 3, 3, 2, 1, 2, 2, 3, 2, 2, 3, 1, 2, 2, 2, 2, 3, 2, 1, 3, 3, 2, 2, 2, 2, 2, 3, 2, 3, 1, 3, 1, 3, 3, 2, 2, 1, 1, 2, 2, 2, 3, 2, 1, 3, 2, 2, 1, 3, 2, 3, 2, 2, 2, 1)
mix_assign <- c(mix_assign, 3, 1, 2, 3, 2, 3, 3, 3, 1, 2, 3, 2, 2, 2, 1, 2, 2, 2, 2, 3, 2, 3, 3, 3, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 3, 3, 2, 3, 2, 2, 2, 2, 2, 2, 2, 3, 1, 3, 2, 3, 2, 3, 3, 3, 2, 3, 2, 3, 1, 2, 1, 2, 2, 2, 1, 3, 2, 2, 2, 3, 2, 3, 2, 1, 3, 2, 3, 2, 1, 2, 1, 2, 2, 3, 2, 1, 2, 3, 2, 2, 2, 3, 3, 2, 2, 2, 2, 3, 2, 2, 1, 2, 2, 3, 1, 1, 2, 2, 3, 3, 3, 1, 1, 3, 2, 2, 2, 3, 2, 2, 2, 2, 2, 3, 2, 2, 2, 3, 2, 2, 1, 2, 2, 2, 1, 2, 3, 2, 2, 2, 1, 1, 3, 2, 2, 2, 3, 3, 3, 2, 2, 3, 2, 2, 3, 3, 2, 2, 3, 2, 1, 2, 2, 2, 1, 1, 1, 3, 3, 3, 3, 1, 3, 1, 2, 2, 2, 2, 2, 1, 3, 2, 2, 2, 3, 2, 2, 2, 3, 1, 2, 1, 3, 1, 2, 3, 1, 3, 1, 3, 2, 2, 2, 1, 2, 2, 2, 3, 2, 3, 2, 1, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 1, 2, 1, 1, 3, 3, 3, 1, 2, 1, 2, 3, 3, 2, 2, 2, 3, 2, 2, 2, 2, 1, 2, 3, 2, 2, 1, 2, 1, 3, 2, 3, 2, 1, 3, 3, 3, 2, 3, 2, 3, 1, 2, 2, 3, 3, 2, 2, 3, 2, 2, 3, 2, 2, 3, 1, 2, 2, 3, 3, 3, 2, 3, 2, 2, 2, 2, 2, 2, 3, 2, 2, 1, 3, 3, 3, 2, 2, 3, 3, 2, 2, 2, 2, 3, 2, 1, 2, 3, 3, 2, 1, 2, 2, 2, 1, 1, 3, 2, 2, 3, 3, 3, 1, 1, 2, 2, 3, 2, 2, 2, 3, 3, 2, 2, 2, 2, 1, 3, 3, 2, 3, 2, 3, 3, 2, 2, 2, 3, 2, 1, 3, 2, 3, 3, 2, 3, 2, 1, 2, 2, 2, 2, 3, 2, 2, 1, 3, 2, 2, 1, 3, 2, 2, 2, 3, 2, 3, 3, 2, 1, 2, 2, 2, 3, 3, 2, 2, 2, 2, 2, 2, 3, 3, 1, 1, 2, 2, 3, 2, 2, 2, 1, 2, 2, 2, 1, 3, 3, 2, 1, 3, 1, 2, 3, 2, 1, 2, 3, 2, 1, 2, 3, 1, 3, 2, 3, 2, 2, 1, 3, 1, 3, 2, 2, 3, 2, 2, 1, 3, 2, 2, 2, 2, 2, 3, 3, 2)
mix_example <- data.frame(x=xExample, assignment=mix_assign)
str(mix_example)
## 'data.frame': 1000 obs. of 2 variables:
## $ x : num 7.3 58.7 9.7 16.9 6.3 35.1 33.5 61.3 28.3 24.3 ...
## $ assignment: num 1 2 1 1 1 2 2 2 3 3 ...
library(flexmix)
## Loading required package: lattice
set.seed(1515)
# If wanting verbose output
# control = list(tolerance = 1e-15, verbose = 1, iter = 1e4)
fit_mix_example <- flexmix(x ~ 1, data = mix_example, k = 3, model = FLXMCnorm1(),
control = list(tolerance = 1e-15, iter = 1e4)
)
proportions <- prior(fit_mix_example)
comp_1 <- parameters(fit_mix_example, component = 1)
comp_2 <- parameters(fit_mix_example, component = 2)
comp_3 <- parameters(fit_mix_example, component = 3)
# Component density scaled by its mixing proportion, used to draw a single
# fitted mixture component on top of a density histogram.
fun_prop <- function(x, mean, sd, proportion){
  dnorm(x = x, mean = mean, sd = sd) * proportion
}
ggplot(mix_example) +
geom_histogram(aes(x = x, y = ..density..)) +
stat_function(geom = "line", fun = fun_prop,
args = list(mean = comp_1[1], sd = comp_1[2], proportion = proportions[1])
) +
stat_function(geom = "line", fun = fun_prop,
args = list(mean = comp_2[1], sd = comp_2[2], proportion = proportions[2])
) +
stat_function(geom = "line", fun = fun_prop,
args = list(mean = comp_3[1], sd = comp_3[2], proportion = proportions[3])
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Explore the first assignments
head(clusters(fit_mix_example))
## [1] 2 1 2 2 2 3
# Explore the first real labels
head(mix_example$assignment)
## [1] 1 2 1 1 1 2
# Create frequency table
table(mix_example$assignment, clusters(fit_mix_example))
##
## 1 2 3
## 1 0 184 1
## 2 464 5 37
## 3 18 2 289
genderData <- readr::read_csv("./RInputFiles/gender.csv")
## Parsed with column specification:
## cols(
## Gender = col_character(),
## Height = col_double(),
## Weight = col_double(),
## BMI = col_double(),
## probability = col_double()
## )
str(genderData)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 10000 obs. of 5 variables:
## $ Gender : chr "Male" "Male" "Male" "Male" ...
## $ Height : num 73.8 68.8 74.1 71.7 69.9 ...
## $ Weight : num 242 162 213 220 206 ...
## $ BMI : num 0.0444 0.0343 0.0387 0.0428 0.0423 ...
## $ probability: num 5.78e-06 6.06e-01 2.63e-05 3.63e-04 4.61e-03 ...
## - attr(*, "spec")=
## .. cols(
## .. Gender = col_character(),
## .. Height = col_double(),
## .. Weight = col_double(),
## .. BMI = col_double(),
## .. probability = col_double()
## .. )
set.seed(1313)
fit_with_covariance <- flexmix(cbind(Weight, BMI) ~ 1, data = genderData, k = 2,
model = FLXMCmvnorm(diag = FALSE),
control = list(tolerance = 1e-15, iter.max = 1000)
)
# Get the parameters
comp_1 <- parameters(fit_with_covariance, component = 1)
comp_2 <- parameters(fit_with_covariance, component = 2)
# The means
mean_comp_1 <- comp_1[1:2]
mean_comp_1
## [1] 135.97738684 0.03334097
mean_comp_2 <- comp_2[1:2]
mean_comp_2
## [1] 186.6849545 0.0391035
# The covariance matrices
covariance_comp_1 <- matrix(comp_1[3:6], nrow = 2)
covariance_comp_1
## [,1] [,2]
## [1,] 370.85097459 4.712215e-02
## [2,] 0.04712215 8.103393e-06
covariance_comp_2 <- matrix(comp_2[3:6], nrow = 2)
covariance_comp_2
## [,1] [,2]
## [1,] 405.22840544 2.742036e-02
## [2,] 0.02742036 4.668417e-06
# Create ellipse curve 1
ellipse_comp_1 <- ellipse::ellipse(x = covariance_comp_1, centre = mean_comp_1, npoints = nrow(genderData))
head(ellipse_comp_1)
## x y
## [1,] 181.4301 0.04005980
## [2,] 181.4223 0.04006096
## [3,] 181.4144 0.04006212
## [4,] 181.4065 0.04006327
## [5,] 181.3986 0.04006442
## [6,] 181.3906 0.04006557
# Create ellipse curve 2
ellipse_comp_2 <- ellipse::ellipse(x = covariance_comp_2, centre = mean_comp_2, npoints = nrow(genderData))
head(ellipse_comp_2)
## x y
## [1,] 231.1740 0.04387866
## [2,] 231.1607 0.04388009
## [3,] 231.1473 0.04388151
## [4,] 231.1340 0.04388294
## [5,] 231.1206 0.04388436
## [6,] 231.1072 0.04388578
# Plot the ellipses
genderData %>%
ggplot(aes(x = Weight, y = BMI)) + geom_point()+
geom_path(data = data.frame(ellipse_comp_1), aes(x=x,y=y), col = "red") +
geom_path(data = data.frame(ellipse_comp_2), aes(x=x,y=y), col = "blue")
# Check the assignments
table(genderData$Gender, clusters(fit_with_covariance))
##
## 1 2
## Female 4540 460
## Male 386 4614
Chapter 4 - Mixture Models Beyond Gaussians
Bernoulli Mixture Models:
Bernoulli Mixture Models with flexmix:
Poisson Mixture Models:
Poisson Mixture Models with flexmix:
Example code includes:
# Create the vector of probabilities
p_cluster_1 <- c(0.8, 0.8, 0.2, 0.9)
# Create the sample for each pixel
set.seed(18102308)
pixel_1 <- sample(c(0, 1), 100, replace = TRUE, prob = c(1-p_cluster_1[1], p_cluster_1[1]))
pixel_2 <- sample(c(0, 1), 100, replace = TRUE, prob = c(1-p_cluster_1[2], p_cluster_1[2]))
pixel_3 <- sample(c(0, 1), 100, replace = TRUE, prob = c(1-p_cluster_1[3], p_cluster_1[3]))
pixel_4 <- sample(c(0, 1), 100, replace = TRUE, prob = c(1-p_cluster_1[4], p_cluster_1[4]))
# Combine the samples
sample_cluster_1 <- cbind(pixel_1, pixel_2, pixel_3, pixel_4)
# Have a look to the sample
head(sample_cluster_1)
## pixel_1 pixel_2 pixel_3 pixel_4
## [1,] 1 1 0 1
## [2,] 1 1 0 1
## [3,] 1 1 0 1
## [4,] 1 1 0 1
## [5,] 0 1 0 1
## [6,] 1 1 0 1
digitUse2 <- rowSums(digitKey[, c(1, 3, 10)]) == 1
digits_sample_2 <- digitData[digitUse2, ]
dim(digits_sample_2)
## [1] 478 256
# transform into matrix
digits_sample_2 <- as.matrix(digits_sample_2)
# dimension
dim(digits_sample_2)
## [1] 478 256
# look to the first observation
show_digit(digits_sample_2[1, ])
# look to the last observation
show_digit(digits_sample_2[nrow(digits_sample_2), ])
set.seed(1513)
# Fit Bernoulli mixture model
bernoulli_mix_model <- flexmix(digits_sample_2 ~ 1, k = 3, model = FLXMCmvbinary(),
control = list(tolerance = 1e-15, iter.max = 1000)
)
prior(bernoulli_mix_model)
## [1] 0.3117220 0.3353131 0.3529649
# Extract the parameters for each cluster
param_comp_1 <- parameters(bernoulli_mix_model, component = 1)
param_comp_2 <- parameters(bernoulli_mix_model, component = 2)
param_comp_3 <- parameters(bernoulli_mix_model, component = 3)
# Visualize the clusters
show_digit(param_comp_1)
show_digit(param_comp_2)
show_digit(param_comp_3)
set.seed(1541)
# Create the vector of lambdas
lambda_1 <- c(150, 300, 50)
# Create the sample of each crime
assault_1 <- rpois(n = 10, lambda = lambda_1[1])
robbery_1 <- rpois(n = 10, lambda = lambda_1[2])
battery_1 <- rpois(n = 10, lambda = lambda_1[3])
# Combine the results
cities_1 <- cbind(assault_1, robbery_1, battery_1)
# Check the sample
cities_1
## assault_1 robbery_1 battery_1
## [1,] 154 297 55
## [2,] 142 276 50
## [3,] 166 312 41
## [4,] 158 273 56
## [5,] 129 278 52
## [6,] 150 307 39
## [7,] 140 321 49
## [8,] 152 321 53
## [9,] 143 318 42
## [10,] 125 324 51
crimes <- readr::read_csv("./RInputFiles/CoC_crimes.csv")
## Parsed with column specification:
## cols(
## COMMUNITY = col_character(),
## ASSAULT = col_double(),
## BATTERY = col_double(),
## BURGLARY = col_double(),
## `CRIMINAL DAMAGE` = col_double(),
## `CRIMINAL TRESPASS` = col_double(),
## `DECEPTIVE PRACTICE` = col_double(),
## `MOTOR VEHICLE THEFT` = col_double(),
## NARCOTICS = col_double(),
## OTHER = col_double(),
## `OTHER OFFENSE` = col_double(),
## ROBBERY = col_double(),
## THEFT = col_double()
## )
dim(crimes)
## [1] 77 13
names(crimes) <- stringr::str_replace_all(stringr::str_to_lower(names(crimes)), " ", ".")
# Check with glimpse
glimpse(crimes)
## Observations: 77
## Variables: 13
## $ community <chr> "ALBANY PARK", "ARCHER HEIGHTS", "ARMOUR SQUARE...
## $ assault <dbl> 123, 51, 74, 169, 708, 1198, 118, 135, 337, 63,...
## $ battery <dbl> 429, 134, 184, 448, 1681, 3347, 280, 350, 850, ...
## $ burglary <dbl> 147, 92, 55, 194, 339, 517, 76, 145, 327, 64, 1...
## $ criminal.damage <dbl> 287, 114, 99, 379, 859, 1666, 150, 310, 528, 12...
## $ criminal.trespass <dbl> 38, 23, 56, 43, 228, 265, 29, 36, 88, 29, 27, 3...
## $ deceptive.practice <dbl> 137, 67, 59, 178, 310, 767, 73, 200, 314, 89, 9...
## $ motor.vehicle.theft <dbl> 176, 50, 37, 189, 281, 732, 58, 123, 411, 42, 6...
## $ narcotics <dbl> 27, 18, 9, 30, 345, 1456, 15, 22, 119, 10, 35, ...
## $ other <dbl> 107, 37, 48, 114, 584, 1261, 76, 88, 238, 22, 6...
## $ other.offense <dbl> 158, 44, 35, 164, 590, 1130, 94, 142, 339, 77, ...
## $ robbery <dbl> 144, 30, 98, 111, 349, 829, 65, 109, 172, 24, 5...
## $ theft <dbl> 690, 180, 263, 461, 1201, 2137, 239, 669, 846, ...
# Transform into a matrix, without `community`
matrix_crimes <- crimes %>%
select(-community) %>%
as.matrix()
# Check the first values
head(matrix_crimes)
## assault battery burglary criminal.damage criminal.trespass
## [1,] 123 429 147 287 38
## [2,] 51 134 92 114 23
## [3,] 74 184 55 99 56
## [4,] 169 448 194 379 43
## [5,] 708 1681 339 859 228
## [6,] 1198 3347 517 1666 265
## deceptive.practice motor.vehicle.theft narcotics other other.offense
## [1,] 137 176 27 107 158
## [2,] 67 50 18 37 44
## [3,] 59 37 9 48 35
## [4,] 178 189 30 114 164
## [5,] 310 281 345 584 590
## [6,] 767 732 1456 1261 1130
## robbery theft
## [1,] 144 690
## [2,] 30 180
## [3,] 98 263
## [4,] 111 461
## [5,] 349 1201
## [6,] 829 2137
set.seed(2017)
# Fit the Poisson mixture model
poisson_mm <- stepFlexmix(matrix_crimes ~ 1, k = 1:15, nrep = 5, model = FLXMCmvpois(),
control = list(tolerance = 1e-15, iter.max = 1000)
)
## 1 : * * * * *
## 2 : * * * * *
## 3 : * * * * *
## 4 : * * * * *
## 5 : * * * * *
## 6 : * * * * *
## 7 : * * * * *
## 8 : * * * * *
## 9 : * * * * *
## 10 : * * * * *
## 11 : * * * * *
## 12 : * * * * *
## 13 : * * * * *
## 14 : * * * * *
## 15 : * * * * *
# Select the model that minimize the BIC
best_poisson_mm <- getModel(poisson_mm, which = "BIC")
# Get the parameters into a data frame
params_lambdas <- data.frame(parameters(best_poisson_mm))
# Add the column with the type of crime
params_lambdas_crime <- params_lambdas %>%
mutate(crime = colnames(matrix_crimes))
# Plot the clusters with their lambdas
params_lambdas_crime %>%
gather(cluster, lambdas, -crime) %>%
ggplot(aes(x = crime, y = lambdas, fill = crime)) +
geom_bar(stat = "identity") +
facet_wrap(~ cluster) +
theme(axis.text.x = element_text(angle = 90, hjust = 1), legend.position = "none")
# Add the cluster assignments
crimes_with_clusters <- crimes %>%
mutate(cluster = factor(clusters(best_poisson_mm)))
# Plot the clusters with the communities
crimes_with_clusters %>%
group_by(cluster) %>%
mutate(number = row_number()) %>%
ggplot(aes(x = cluster, y = number, col = cluster)) +
geom_text(aes(label = community), size = 2.3) +
theme(legend.position="none")
Chapter 1 - The R Package Structure
Introduction to Package Building:
Description and Namespace Files:
Optional Directories:
Example code includes:
# Use the create function to set up your first package
devtools::create("./RPackages/datasummary")
# Take a look at the files and folders in your package
dir("./RPackages/datasummary")
# Create numeric_summary() function
#' Summary of a Numeric Vector
#'
#' Computes the minimum, median, standard deviation, and maximum of a numeric
#' vector, returned as a one-row data frame.
#'
#' @param x A numeric vector to summarize.
#' @param na.rm A logical; should missing values be removed before computing
#'   the summaries? Defaults to FALSE, matching the base R summary functions,
#'   so existing calls that pass it explicitly are unaffected.
numeric_summary <- function(x, na.rm = FALSE) {
# Include an error if x is not numeric
if(!is.numeric(x)){
stop("Data must be numeric")
}
# Create a one-row data frame of summary statistics
data.frame( min = min(x, na.rm = na.rm),
median = median(x, na.rm = na.rm),
sd = sd(x, na.rm = na.rm),
max = max(x, na.rm = na.rm))
}
data(airquality)
# Test numeric_summary() function
numeric_summary(airquality$Ozone, TRUE)
# What is in the R directory before adding a function?
dir("./RPackages/datasummary/R")
# Use the dump() function to write the numeric_summary function
dump("numeric_summary", file = "./RPackages/datasummary/R/numeric_summary.R")
# Verify that the file is in the correct directory
dir("./RPackages/datasummary/R")
# a package should not have the same name as an existing package and its name must only contain letters, numbers, or dots.
# What is in the package at the moment?
dir("./RPackages/datasummary")
# Add the weather data
data(Weather, package="mosaicData")
devtools::use_data(Weather, pkg = "./RPackages/datasummary")
# Add a vignette called "Generating Summaries with Data Summary"
devtools::use_vignette("Generating_Summaries_with_Data_Summary", pkg = "./RPackages/datasummary")
# What directories do you now have in your package now?
dir("./RPackages/datasummary")
#' Data Summary for Numeric Columns
#'
#' Applies numeric_summary() to every numeric column of a data frame and
#' binds the results, one row per column, with the column name in `ID`.
#'
#' @param x A data frame; non-numeric columns are dropped.
#' @param na.rm A logical indicating whether missing values should be removed.
data_summary <- function(x, na.rm = TRUE){
num_data <- select_if(x, .predicate = is.numeric)
# Bug fix: forward the caller's na.rm instead of hard-coding TRUE, so
# data_summary(x, na.rm = FALSE) behaves as documented. This matches the
# later versions of this function defined further down in this document.
map_df(num_data, .f = numeric_summary, na.rm = na.rm, .id = "ID")
}
# Write the function to the R directory
dump("data_summary", file = "./RPackages/datasummary/R/data_summary.R")
dir("./RPackages/datasummary")
Chapter 2 - Documenting Packages
Introduction to roxygen2:
How to export functions:
Documenting other elements:
Documenting a package:
Example code includes:
#' Summary of Numeric Columns
#'
#' Generate specific summaries of numeric columns in a data frame
#'
#' @param x A data frame. Non-numeric columns will be removed
#' @param na.rm A logical indicating whether missing values should be removed
#' @import purrr
#' @import dplyr
#' @importFrom tidyr gather
data_summary <- function(x, na.rm = TRUE){
num_data <- select_if(x, .predicate = is.numeric)
map_df(num_data, .f = numeric_summary, na.rm = na.rm, .id = "ID")
}
#' Summary of Numeric Columns
#'
#' Generate specific summaries of numeric columns in a data frame
#'
#' @param x A data frame. Non-numeric columns will be removed
#' @param na.rm A logical indicating whether missing values should be removed
#' @import dplyr
#' @import purrr
#' @importFrom tidyr gather
#' @export
data_summary <- function(x, na.rm = TRUE){
num_data <- select_if(x, .predicate = is.numeric)
map_df(num_data, .f = numeric_summary, na.rm = na.rm, .id = "ID")
}
#' Data Summary for Numeric Columns
#'
#' Custom summaries of numeric data in a provided data frame
#'
#' @param x A data.frame containing at least one numeric column
#' @param na.rm A logical indicating whether missing values should be removed
#' @import dplyr
#' @import purrr
#' @importFrom tidyr gather
#' @export
#' @examples
#' data_summary(iris)
#' data_summary(airquality, na.rm = FALSE)
data_summary <- function(x, na.rm = TRUE){
num_data <- select_if(x, .predicate = is.numeric)
map_df(num_data, .f = numeric_summary, na.rm = na.rm, .id = "ID")
}
# For code you use \code{text to format}
# To link to other functions you use \link[packageName]{functionName}, although note the package name is only required if the function is not in your package
# To include an unordered list you use \itemize{}. Inside the brackets you mark new items with \item followed by the item text.
#' Data Summary for Numeric Columns
#'
#' Custom summaries of numeric data in a provided data frame
#'
#' @param x A data.frame containing at least one numeric column
#' @param na.rm A logical indicating whether missing values should be removed
#' @import dplyr
#' @import purrr
#' @importFrom tidyr gather
#' @export
#' @examples
#' data_summary(iris)
#' data_summary(airquality, na.rm = FALSE)
#'
## Update the details for the return value
#' @return
#' This function returns a \code{data.frame} including columns:
#' \itemize{
#' \item ID
#' \item min
#' \item median
#' \item sd
#' \item max
#' }
#'
#' @export
data_summary <- function(x, na.rm = TRUE){
num_data <- select_if(x, .predicate = is.numeric)
map_df(num_data, .f = numeric_summary, na.rm = na.rm, .id = "ID")
}
#' Summary of Numeric Columns
#' Generate specific summaries of numeric columns in a data frame
#'
#' @param x A data frame. Non-numeric columns will be removed
#' @param na.rm A logical indicating whether missing values should be removed
#' @import dplyr
#' @import purrr
#' @importFrom tidyr gather
#' @export
#' @examples
#' data_summary(iris)
#' data_summary(airquality, na.rm = FALSE)
#'
#' @return This function returns a \code{data.frame} including columns:
#' \itemize{
#' \item ID
#' \item min
#' \item median
#' \item sd
#' \item max
#' }
#'
## Add in the author of the `data_summary()` function.
#' @author My Name <myemail@example.com>
## Update the header to link to the `summary()` function (in the `base` package).
#' @seealso \link[base]{summary}
data_summary <- function(x, na.rm = TRUE){
num_data <- select_if(x, .predicate = is.numeric)
map_df(num_data, .f = numeric_summary, na.rm = na.rm, .id = "ID")
}
#' Custom Data Summaries
#'
#' Easily generate custom data frame summaries
#'
#' @docType package
#' @name datasummary
## Fix: roxygen2 requires the quoted sentinel "_PACKAGE" for package-level
## documentation; a bare _PACKAGE is not valid R syntax (identifiers cannot
## begin with an underscore).
"_PACKAGE"
#' Random Weather Data
#'
#' A dataset containing randomly generated weather data.
#'
#' @format A data frame of 7 rows and 3 columns
#' \describe{
#' \item{Day}{Numeric values giving day of the week, 1 = Monday, 7 = Sunday}
#' \item{Temp}{Integer values giving temperature in degrees Celsius}
#' \item{Weather}{Character values giving precipitation type, Sun if none}
#' }
#' @source Randomly generated data
weather
# Generate package documentation
document("datasummary")
# Examine the contents of the man directory
dir("datasummary/man")
# View the documentation for the data_summary function
help("data_summary")
# View the documentation for the weather dataset
help("weather")
Chapter 3 - Checking and Building R Packages
Why check an R package?
Errors, warnings, and notes:
Differences in package dependencies:
Building packages with continuous integration:
Example code includes:
# Check your package
check("datasummary")
#' Numeric Summaries
#'
#' Summarises numeric data and returns a data frame containing the minimum
#' value, median, standard deviation, and maximum value.
#'
#' @param x a numeric vector containing the values to summarize.
#' @param na.rm a logical value indicating whether NA values should be
#'   stripped before the computation proceeds.
numeric_summary <- function(x, na.rm){
  if (!is.numeric(x)) {
    stop("data must be numeric")
  }
  # Apply each summary statistic with the same na.rm setting
  apply_stat <- function(stat) stat(x, na.rm = na.rm)
  data.frame(min = apply_stat(min),
             median = apply_stat(median),
             sd = apply_stat(sd),
             max = apply_stat(max))
}
# The way in which you define variables in tidyverse package functions can cause confusion for the R CMD check, which sees column names and the name of your dataset, and flags them as "undefined global variables".
# To get around this, you can manually specify the data and its columns as a vector to utils::globalVariables(), by including a line of code similar to the following in your package-level documentation:
# utils::globalVariables(c("dataset_name", "col_name_1", "col_name_2"))
# This defines dataset_name, col_name_1, and col_name_2 as global variables, and now you shouldn't get the undefined global variables error.
#' datasummary: Custom Data Summaries
#'
#' Easily generate custom data frame summaries
#'
#' @docType package
#' @name datasummary
## Fix: use the quoted sentinel "_PACKAGE"; a bare _PACKAGE is a syntax
## error in R (identifiers cannot begin with an underscore).
"_PACKAGE"
# Update this function call
utils::globalVariables(c("weather", "Temp"))
# NOTE(review): use_package(), build(), and dir() usage below assumes the
# devtools package is attached (as in the course session) — confirm before
# running this snippet standalone.
# Add dplyr as an imported dependency to the DESCRIPTION file
use_package("dplyr", pkg = "datasummary")
# Add purrr as an imported dependency to the DESCRIPTION file
use_package("purrr", pkg = "datasummary")
# Add tidyr as an imported dependency to the DESCRIPTION file
use_package("tidyr", pkg = "datasummary")
# Build the package
build("datasummary")
# Examine the contents of the current directory
dir("datasummary")
Chapter 4 - Adding Unit Tests to R Packages
What are unit tests and why write them?
vec1 not identical to c(1, 2). names for target but not for current
Testing errors and warnings:
Testing specific output and non-exported functions:
Grouping and running tests:
Wrap up:
Example code includes:
# NOTE(review): this snippet relies on objects provided by the course
# environment (data_summary(), the weather dataset) and on devtools/testthat
# being attached — it is illustrative and will not run standalone.
# Set up the test framework
use_testthat("datasummary")
# Look at the contents of the package root directory
dir("datasummary")
# Look at the contents of the new folder which has been created
dir("datasummary/tests")
# Create a summary of the iris dataset using your data_summary() function
iris_summary <- data_summary(iris)
# Count how many rows are returned
summary_rows <- nrow(iris_summary)
# Use expect_equal to test that calling data_summary() on iris returns 4 rows
expect_equal(summary_rows, 4)
result <- data_summary(weather)
# Update this test so it passes
# (tolerance makes the comparison approximate rather than exact)
expect_equal(result$sd, c(2.1, 3.6), tolerance = 0.1)
expected_result <- list(
ID = c("Day", "Temp"),
min = c(1L, 14L),
median = c(4L, 19L),
sd = c(2.16024689946929, 3.65148371670111),
max = c(7L, 24L)
)
# Write a passing test that compares expected_result to result
# NOTE(review): expect_equivalent() is deprecated in testthat 3e; the modern
# equivalent is expect_equal(..., ignore_attr = TRUE).
expect_equivalent(result, expected_result)
# Create a vector containing the numbers 1 through 10
my_vector <- 1:10
# Look at what happens when we apply this vector as an argument to data_summary()
data_summary(my_vector)
# Test if running data_summary() on this vector returns an error
expect_error(data_summary(my_vector))
# Run data_summary on the airquality dataset with na.rm set to FALSE
data_summary(airquality, na.rm=FALSE)
# Use expect_warning to formally test this
expect_warning(data_summary(airquality, na.rm = FALSE))
# Expected result
expected <- data.frame(min = 14L, median = 19L, sd = 3.65148371670111, max = 24L)
# Create variable result by calling numeric summary on the temp column of the weather dataset
# (::: reaches the non-exported helper inside the package namespace)
result <- datasummary:::numeric_summary(weather$Temp, na.rm = TRUE)
# Test that the value returned matches the expected value
expect_equal(result, expected)
# Use context() and test_that() to group the tests below together
# NOTE(review): context() is superseded in testthat 3e — file names now serve
# as the grouping label.
context("Test data_summary()")
test_that("data_summary() handles errors correctly", {
# Create a vector
my_vector <- 1:10
# Use expect_error()
expect_error(data_summary(my_vector))
# Use expect_warning()
expect_warning(data_summary(airquality, na.rm = FALSE))
})
# Run the tests on the datasummary package
test("datasummary")
Chapter 1 - Evaluating Your Measure with Factor Analysis
Introduction to Exploratory Factor Analysis:
Overview of the Measure Development Process:
Measure Features: Correlations and Reliability:
Example code includes:
# Load the psych package
library(psych)
##
## Attaching package: 'psych'
## The following object is masked from 'package:lavaan':
##
## cor2cov
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
gcbs <- readRDS("./RInputFiles/GCBS_data.rds")
glimpse(gcbs)
## Observations: 2,495
## Variables: 15
## $ Q1 <int> 5, 5, 2, 5, 5, 1, 4, 5, 1, 1, 4, 5, 5, 5, 5, 4, 4, 2, 4, 5, 3, ...
## $ Q2 <int> 5, 5, 4, 4, 4, 1, 3, 4, 1, 2, 4, 5, 4, 4, 4, 4, 4, 1, 2, 2, 1, ...
## $ Q3 <int> 3, 5, 1, 1, 1, 1, 3, 3, 1, 1, 4, 1, 2, 4, 5, 2, 1, 1, 1, 1, 1, ...
## $ Q4 <int> 5, 5, 2, 2, 4, 1, 3, 3, 1, 1, 5, 5, 4, 5, 5, 5, 3, 1, 1, 3, 1, ...
## $ Q5 <int> 5, 5, 2, 4, 4, 1, 4, 4, 1, 1, 5, 5, 5, 4, 5, 5, 4, 1, 1, 4, 1, ...
## $ Q6 <int> 5, 3, 2, 5, 5, 1, 3, 5, 1, 5, 5, 5, 5, 5, 5, 4, 2, 1, 1, 3, 2, ...
## $ Q7 <int> 5, 5, 4, 4, 4, 1, 3, 5, 1, 1, 4, 3, 3, 5, 5, 5, 4, 1, 1, 2, 1, ...
## $ Q8 <int> 3, 5, 2, 1, 3, 1, 4, 5, 1, 1, 4, 1, 3, 5, 5, 1, 1, 1, 1, 3, 1, ...
## $ Q9 <int> 4, 1, 2, 4, 1, 1, 2, 5, 1, 1, 2, 1, 5, 5, 5, 3, 1, 1, 1, 4, 1, ...
## $ Q10 <int> 5, 4, 4, 5, 5, 1, 3, 5, 1, 4, 5, 5, 3, 5, 5, 5, 4, 2, 3, 4, 1, ...
## $ Q11 <int> 5, 4, 2, 5, 5, 1, 3, 5, 1, 1, 4, 5, 4, 5, 4, 4, 4, 2, 2, 4, 1, ...
## $ Q12 <int> 5, 5, 4, 5, 5, 1, 2, 5, 1, 1, 2, 5, 3, 5, 3, 5, 1, 1, 2, 2, 1, ...
## $ Q13 <int> 3, 4, 0, 1, 3, 1, 2, 3, 1, 1, 2, 1, 3, 4, 5, 4, 1, 1, 1, 4, 1, ...
## $ Q14 <int> 5, 4, 2, 4, 5, 1, 3, 4, 1, 1, 1, 5, 3, 4, 5, 5, 4, 1, 2, 4, 2, ...
## $ Q15 <int> 5, 5, 4, 5, 5, 1, 4, 5, 1, 5, 5, 5, 5, 5, 5, 5, 4, 2, 3, 4, 2, ...
# Conduct a single-factor EFA
EFA_model <- fa(gcbs)
# View the results
EFA_model
## Factor Analysis using method = minres
## Call: fa(r = gcbs)
## Standardized loadings (pattern matrix) based upon correlation matrix
## MR1 h2 u2 com
## Q1 0.70 0.49 0.51 1
## Q2 0.72 0.52 0.48 1
## Q3 0.64 0.41 0.59 1
## Q4 0.77 0.59 0.41 1
## Q5 0.67 0.45 0.55 1
## Q6 0.75 0.56 0.44 1
## Q7 0.73 0.54 0.46 1
## Q8 0.65 0.43 0.57 1
## Q9 0.70 0.48 0.52 1
## Q10 0.56 0.32 0.68 1
## Q11 0.72 0.52 0.48 1
## Q12 0.79 0.62 0.38 1
## Q13 0.68 0.46 0.54 1
## Q14 0.74 0.55 0.45 1
## Q15 0.57 0.33 0.67 1
##
## MR1
## SS loadings 7.27
## Proportion Var 0.48
##
## Mean item complexity = 1
## Test of the hypothesis that 1 factor is sufficient.
##
## The degrees of freedom for the null model are 105 and the objective function was 9.31 with Chi Square of 23173.8
## The degrees of freedom for the model are 90 and the objective function was 1.93
##
## The root mean square of the residuals (RMSR) is 0.08
## The df corrected root mean square of the residuals is 0.09
##
## The harmonic number of observations is 2495 with the empirical chi square 3398.99 with prob < 0
## The total number of observations was 2495 with Likelihood Chi Square = 4809.34 with prob < 0
##
## Tucker Lewis Index of factoring reliability = 0.761
## RMSEA index = 0.145 and the 90 % confidence intervals are 0.142 0.149
## BIC = 4105.36
## Fit based upon off diagonal values = 0.97
## Measures of factor score adequacy
## MR1
## Correlation of (regression) scores with factors 0.97
## Multiple R square of scores with factors 0.94
## Minimum correlation of possible factor scores 0.87
# Set up the single-factor EFA
EFA_model <- fa(gcbs)
# View the factor loadings
EFA_model$loadings
##
## Loadings:
## MR1
## Q1 0.703
## Q2 0.719
## Q3 0.638
## Q4 0.770
## Q5 0.672
## Q6 0.746
## Q7 0.734
## Q8 0.654
## Q9 0.695
## Q10 0.565
## Q11 0.719
## Q12 0.786
## Q13 0.679
## Q14 0.743
## Q15 0.574
##
## MR1
## SS loadings 7.267
## Proportion Var 0.484
# Create a path diagram of the items' factor loadings
fa.diagram(EFA_model)
# Take a look at the first few lines of the response data and their corresponding sum scores
head(gcbs)
## Q1 Q2 Q3 Q4 Q5 Q6 Q7 Q8 Q9 Q10 Q11 Q12 Q13 Q14 Q15
## 1 5 5 3 5 5 5 5 3 4 5 5 5 3 5 5
## 2 5 5 5 5 5 3 5 5 1 4 4 5 4 4 5
## 3 2 4 1 2 2 2 4 2 2 4 2 4 0 2 4
## 4 5 4 1 2 4 5 4 1 4 5 5 5 1 4 5
## 5 5 4 1 4 4 5 4 3 1 5 5 5 3 5 5
## 6 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
rowSums(gcbs[1:6, ])
## 1 2 3 4 5 6
## 68 65 37 55 59 15
# Then look at the first few lines of individuals' factor scores
head(EFA_model$scores)
## MR1
## [1,] 1.5614675
## [2,] 1.3432026
## [3,] -0.3960355
## [4,] 0.7478868
## [5,] 1.0435203
## [6,] -1.7290812
# To get a feel for how the factor scores are distributed, look at their summary statistics and density plot.
summary(EFA_model$scores)
## MR1
## Min. :-1.854703
## 1st Qu.:-0.783260
## Median :-0.001971
## Mean : 0.000000
## 3rd Qu.: 0.728568
## Max. : 1.949580
plot(density(EFA_model$scores, na.rm = TRUE), main = "Factor Scores")
# Basic descriptive statistics
describe(gcbs)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## Q1 1 2495 3.47 1.46 4 3.59 1.48 0 5 5 -0.55 -1.10 0.03
## Q2 2 2495 2.96 1.49 3 2.96 1.48 0 5 5 -0.01 -1.40 0.03
## Q3 3 2495 2.05 1.39 1 1.82 0.00 0 5 5 0.98 -0.44 0.03
## Q4 4 2495 2.64 1.45 2 2.55 1.48 0 5 5 0.26 -1.34 0.03
## Q5 5 2495 3.25 1.47 4 3.32 1.48 0 5 5 -0.35 -1.27 0.03
## Q6 6 2495 3.11 1.51 3 3.14 1.48 0 5 5 -0.17 -1.42 0.03
## Q7 7 2495 2.67 1.51 2 2.59 1.48 0 5 5 0.28 -1.39 0.03
## Q8 8 2495 2.45 1.57 2 2.32 1.48 0 5 5 0.51 -1.30 0.03
## Q9 9 2495 2.23 1.42 2 2.05 1.48 0 5 5 0.76 -0.82 0.03
## Q10 10 2495 3.50 1.39 4 3.63 1.48 1 5 4 -0.59 -0.94 0.03
## Q11 11 2495 3.27 1.40 4 3.34 1.48 0 5 5 -0.35 -1.11 0.03
## Q12 12 2495 2.64 1.50 2 2.56 1.48 0 5 5 0.29 -1.37 0.03
## Q13 13 2495 2.10 1.38 1 1.89 0.00 0 5 5 0.89 -0.56 0.03
## Q14 14 2495 2.96 1.49 3 2.95 1.48 0 5 5 -0.02 -1.43 0.03
## Q15 15 2495 4.23 1.10 5 4.47 0.00 0 5 5 -1.56 1.71 0.02
# Graphical representation of error
error.dots(gcbs)
# Graphical representation of error
error.bars(gcbs)
# Randomly split the gcbs rows in half: one half for exploratory factor
# analysis (EFA), the other held out for confirmatory factor analysis (CFA).
# NOTE(review): sample() is used without set.seed(), so the split (and any
# downstream results) will differ on every run — consider seeding for
# reproducibility.
# Establish two sets of indices to split the dataset
N <- nrow(gcbs)
indices <- seq(1, N)
indices_EFA <- sample(indices, floor((.5*N)))
indices_CFA <- indices[!(indices %in% indices_EFA)]
# Use those indices to split the dataset into halves for your EFA and CFA
gcbs_EFA <- gcbs[indices_EFA, ]
gcbs_CFA <- gcbs[indices_CFA, ]
# Use the indices from the previous exercise to create a grouping variable
# (1 = EFA half, 2 = CFA half; initialized to numeric zeros, then overwritten)
group_var <- vector("numeric", nrow(gcbs))
group_var[indices_EFA] <- 1
group_var[indices_CFA] <- 2
# Bind that grouping variable onto the gcbs dataset
gcbs_grouped <- cbind(gcbs, group_var)
# Compare stats across groups
describeBy(gcbs_grouped, group = group_var)
##
## Descriptive statistics by group
## group: 1
## vars n mean sd median trimmed mad min max range skew kurtosis
## Q1 1 1247 3.48 1.44 4 3.59 1.48 0 5 5 -0.54 -1.08
## Q2 2 1247 2.99 1.48 3 2.99 1.48 0 5 5 -0.02 -1.37
## Q3 3 1247 2.07 1.38 1 1.84 0.00 0 5 5 0.96 -0.46
## Q4 4 1247 2.62 1.44 2 2.53 1.48 0 5 5 0.26 -1.33
## Q5 5 1247 3.23 1.47 4 3.30 1.48 0 5 5 -0.33 -1.28
## Q6 6 1247 3.13 1.50 3 3.16 1.48 0 5 5 -0.22 -1.40
## Q7 7 1247 2.66 1.51 2 2.58 1.48 0 5 5 0.29 -1.38
## Q8 8 1247 2.49 1.57 2 2.37 1.48 0 5 5 0.46 -1.34
## Q9 9 1247 2.21 1.39 2 2.01 1.48 0 5 5 0.79 -0.76
## Q10 10 1247 3.51 1.40 4 3.63 1.48 1 5 4 -0.59 -0.96
## Q11 11 1247 3.30 1.38 4 3.38 1.48 0 5 5 -0.39 -1.05
## Q12 12 1247 2.63 1.51 2 2.54 1.48 0 5 5 0.31 -1.36
## Q13 13 1247 2.14 1.40 1 1.94 0.00 0 5 5 0.82 -0.69
## Q14 14 1247 2.95 1.49 3 2.94 1.48 0 5 5 -0.01 -1.41
## Q15 15 1247 4.24 1.11 5 4.48 0.00 1 5 4 -1.57 1.67
## group_var 16 1247 1.00 0.00 1 1.00 0.00 1 1 0 NaN NaN
## se
## Q1 0.04
## Q2 0.04
## Q3 0.04
## Q4 0.04
## Q5 0.04
## Q6 0.04
## Q7 0.04
## Q8 0.04
## Q9 0.04
## Q10 0.04
## Q11 0.04
## Q12 0.04
## Q13 0.04
## Q14 0.04
## Q15 0.03
## group_var 0.00
## ------------------------------------------------------------
## group: 2
## vars n mean sd median trimmed mad min max range skew kurtosis
## Q1 1 1248 3.47 1.47 4 3.59 1.48 0 5 5 -0.55 -1.13
## Q2 2 1248 2.94 1.51 3 2.93 1.48 0 5 5 0.01 -1.43
## Q3 3 1248 2.03 1.39 1 1.80 0.00 0 5 5 1.00 -0.43
## Q4 4 1248 2.66 1.46 2 2.57 1.48 0 5 5 0.26 -1.35
## Q5 5 1248 3.28 1.47 4 3.35 1.48 0 5 5 -0.37 -1.26
## Q6 6 1248 3.09 1.51 3 3.11 1.48 0 5 5 -0.11 -1.44
## Q7 7 1248 2.67 1.51 2 2.59 1.48 0 5 5 0.28 -1.40
## Q8 8 1248 2.41 1.57 2 2.27 1.48 0 5 5 0.57 -1.25
## Q9 9 1248 2.26 1.45 2 2.08 1.48 0 5 5 0.73 -0.89
## Q10 10 1248 3.50 1.38 4 3.62 1.48 1 5 4 -0.58 -0.93
## Q11 11 1248 3.23 1.42 3 3.29 1.48 0 5 5 -0.31 -1.17
## Q12 12 1248 2.66 1.50 2 2.58 1.48 0 5 5 0.26 -1.37
## Q13 13 1248 2.06 1.37 1 1.85 0.00 0 5 5 0.95 -0.42
## Q14 14 1248 2.96 1.49 3 2.95 1.48 0 5 5 -0.03 -1.45
## Q15 15 1248 4.21 1.10 5 4.45 0.00 0 5 5 -1.56 1.74
## group_var 16 1248 2.00 0.00 2 2.00 0.00 2 2 0 NaN NaN
## se
## Q1 0.04
## Q2 0.04
## Q3 0.04
## Q4 0.04
## Q5 0.04
## Q6 0.04
## Q7 0.04
## Q8 0.04
## Q9 0.04
## Q10 0.04
## Q11 0.04
## Q12 0.04
## Q13 0.04
## Q14 0.04
## Q15 0.03
## group_var 0.00
statsBy(gcbs_grouped, group = "group_var")
## Statistics within and between groups
## Call: statsBy(data = gcbs_grouped, group = "group_var")
## Intraclass Correlation 1 (Percentage of variance due to groups)
## Q1 Q2 Q3 Q4 Q5 Q6 Q7 Q8
## 0 0 0 0 0 0 0 0
## Q9 Q10 Q11 Q12 Q13 Q14 Q15 group_var
## 0 0 0 0 0 0 0 1
## Intraclass Correlation 2 (Reliability of group differences)
## Q1 Q2 Q3 Q4 Q5 Q6 Q7 Q8
## -93.70 -0.60 -1.00 -1.07 -0.87 -1.35 -104.83 0.48
## Q9 Q10 Q11 Q12 Q13 Q14 Q15 group_var
## -0.36 -84.52 0.40 -2.47 0.49 -337.47 -1.93 1.00
## eta^2 between groups
## Q1.bg Q2.bg Q3.bg Q4.bg Q5.bg Q6.bg Q7.bg Q8.bg Q9.bg Q10.bg Q11.bg
## 0 0 0 0 0 0 0 0 0 0 0
## Q12.bg Q13.bg Q14.bg Q15.bg
## 0 0 0 0
##
## To see the correlations between and within groups, use the short=FALSE option in your print statement.
## Many results are not shown directly. To see specific objects select from the following list:
## mean sd n F ICC1 ICC2 ci1 ci2 raw rbg pbg rwg nw ci.wg pwg etabg etawg nwg nG Call
# Take a look at some correlation data
lowerCor(gcbs, use = "pairwise.complete.obs")
## Q1 Q2 Q3 Q4 Q5 Q6 Q7 Q8 Q9 Q10 Q11
## Q1 1.00
## Q2 0.53 1.00
## Q3 0.36 0.40 1.00
## Q4 0.52 0.53 0.50 1.00
## Q5 0.48 0.46 0.40 0.57 1.00
## Q6 0.63 0.55 0.40 0.61 0.50 1.00
## Q7 0.47 0.67 0.42 0.57 0.45 0.54 1.00
## Q8 0.39 0.38 0.78 0.49 0.41 0.41 0.41 1.00
## Q9 0.42 0.49 0.49 0.56 0.46 0.48 0.53 0.48 1.00
## Q10 0.44 0.38 0.32 0.40 0.43 0.41 0.39 0.36 0.37 1.00
## Q11 0.64 0.52 0.34 0.52 0.49 0.62 0.49 0.37 0.46 0.45 1.00
## Q12 0.52 0.72 0.44 0.60 0.49 0.59 0.75 0.42 0.57 0.40 0.55
## Q13 0.38 0.40 0.71 0.51 0.43 0.42 0.45 0.76 0.54 0.37 0.40
## Q14 0.53 0.50 0.43 0.60 0.54 0.55 0.52 0.45 0.55 0.41 0.56
## Q15 0.51 0.40 0.27 0.39 0.45 0.47 0.39 0.31 0.32 0.45 0.54
## Q12 Q13 Q14 Q15
## Q12 1.00
## Q13 0.49 1.00
## Q14 0.56 0.50 1.00
## Q15 0.41 0.30 0.46 1.00
# Take a look at some correlation data
corr.test(gcbs, use = "pairwise.complete.obs")$p
## Q1 Q2 Q3 Q4 Q5
## Q1 0.000000e+00 1.038105e-175 2.525793e-74 1.746323e-174 5.801103e-143
## Q2 1.384140e-177 0.000000e+00 6.236650e-93 8.388030e-183 1.237758e-127
## Q3 3.608276e-75 3.282447e-94 0.000000e+00 1.087371e-155 1.282718e-94
## Q4 2.359896e-176 1.075388e-184 1.647531e-157 0.000000e+00 7.309882e-214
## Q5 1.054746e-144 2.578663e-129 6.108182e-96 8.032837e-216 0.000000e+00
## Q6 1.477903e-277 1.757495e-198 1.248831e-96 3.037754e-253 1.470071e-154
## Q7 8.142449e-139 0.000000e+00 5.318107e-109 5.260693e-212 2.336455e-122
## Q8 1.549786e-91 5.579984e-85 0.000000e+00 1.898345e-150 1.224898e-101
## Q9 4.344797e-106 3.376084e-148 3.104324e-151 3.108940e-210 1.462606e-133
## Q10 3.550942e-116 9.131376e-87 6.760094e-61 1.298818e-98 1.476449e-111
## Q11 9.499179e-292 1.822105e-173 2.580298e-69 9.037626e-175 2.540009e-153
## Q12 9.097129e-175 0.000000e+00 1.005698e-117 1.550592e-244 2.612164e-152
## Q13 9.052542e-87 1.297778e-96 0.000000e+00 1.798859e-167 1.168044e-111
## Q14 1.912163e-184 3.969981e-158 1.103607e-113 8.219433e-248 4.008639e-188
## Q15 9.793389e-162 3.129498e-96 1.676489e-41 1.255385e-93 3.849304e-125
## Q6 Q7 Q8 Q9 Q10
## Q1 1.448345e-275 4.315498e-137 2.634636e-90 1.433783e-104 1.420377e-114
## Q2 1.493871e-196 0.000000e+00 6.695981e-84 1.958129e-146 1.267356e-85
## Q3 2.997195e-95 1.861338e-107 0.000000e+00 1.862595e-149 2.704037e-60
## Q4 2.916244e-251 4.734624e-210 1.120024e-148 2.766957e-208 3.376926e-97
## Q5 9.408454e-153 9.813111e-121 3.429715e-100 7.459290e-132 5.462862e-110
## Q6 0.000000e+00 2.582592e-187 1.111480e-101 4.903141e-145 2.966233e-99
## Q7 3.149502e-189 0.000000e+00 1.110564e-101 4.177923e-178 1.437786e-89
## Q8 3.704932e-103 3.582463e-103 0.000000e+00 3.981155e-142 1.778530e-78
## Q9 8.755609e-147 5.497267e-180 7.372509e-144 0.000000e+00 2.784487e-79
## Q10 1.098605e-100 9.585238e-91 2.223163e-79 3.093874e-80 0.000000e+00
## Q11 4.981130e-268 8.601767e-153 2.643708e-82 1.655959e-131 4.633595e-125
## Q12 2.403960e-231 0.000000e+00 5.764588e-109 7.071640e-216 1.561964e-98
## Q13 1.353745e-109 7.858024e-124 0.000000e+00 2.581509e-186 4.747674e-81
## Q14 7.044729e-195 6.545724e-170 2.485870e-124 2.858794e-201 3.168647e-104
## Q15 1.340087e-134 7.843219e-91 4.328489e-55 4.516727e-61 2.157366e-124
## Q11 Q12 Q13 Q14 Q15
## Q1 9.404187e-290 6.597467e-173 1.267356e-85 1.472366e-182 6.659505e-160
## Q2 1.293694e-171 0.000000e+00 2.997195e-95 2.659887e-156 6.884895e-95
## Q3 1.548179e-68 4.123361e-116 0.000000e+00 4.304066e-112 1.676489e-41
## Q4 6.597467e-173 1.457557e-242 1.241212e-165 7.808462e-246 2.259693e-92
## Q5 1.600206e-151 1.593420e-150 4.438568e-110 3.206912e-186 1.809173e-123
## Q6 4.831696e-266 2.235683e-229 4.873482e-108 5.847125e-193 6.968453e-133
## Q7 5.333096e-151 0.000000e+00 3.378951e-122 4.582007e-168 1.254915e-89
## Q8 2.908079e-81 1.959960e-107 0.000000e+00 1.093783e-122 1.298547e-54
## Q9 8.114201e-130 6.505909e-214 2.039392e-184 2.458563e-199 2.258363e-60
## Q10 2.131454e-123 3.904910e-97 4.747674e-80 1.013967e-102 9.708145e-123
## Q11 0.000000e+00 1.571210e-193 1.584049e-94 1.610933e-205 1.319712e-186
## Q12 1.870488e-195 0.000000e+00 9.395071e-146 1.350081e-206 3.398356e-100
## Q13 7.920246e-96 1.648258e-147 0.000000e+00 6.585550e-154 2.751475e-52
## Q14 1.851647e-207 1.534183e-208 1.013162e-155 0.000000e+00 2.883420e-130
## Q15 1.629274e-188 1.171847e-101 1.375737e-52 5.766839e-132 0.000000e+00
# Take a look at some correlation data
corr.test(gcbs, use = "pairwise.complete.obs")$ci
## lower r upper p
## Q1-Q2 0.4970162 0.5259992 0.5538098 1.384140e-177
## Q1-Q3 0.3206223 0.3553928 0.3892067 3.608276e-75
## Q1-Q4 0.4953852 0.5244323 0.5523079 2.359896e-176
## Q1-Q5 0.4503342 0.4810747 0.5106759 1.054746e-144
## Q1-Q6 0.6071117 0.6313131 0.6543444 1.477903e-277
## Q1-Q7 0.4412058 0.4722710 0.5022057 8.142449e-139
## Q1-Q8 0.3564216 0.3902059 0.4229712 1.549786e-91
## Q1-Q9 0.3850453 0.4179718 0.4498355 4.344797e-106
## Q1-Q10 0.4034438 0.4357865 0.4670415 3.550942e-116
## Q1-Q11 0.6199265 0.6435136 0.6659388 9.499179e-292
## Q1-Q12 0.4932727 0.5224025 0.5503620 9.097129e-175
## Q1-Q13 0.3464313 0.3805006 0.4135673 9.052542e-87
## Q1-Q14 0.5059498 0.5345780 0.5620298 1.912163e-184
## Q1-Q15 0.4753633 0.5051815 0.5338405 9.793389e-162
## Q2-Q3 0.3618855 0.3955108 0.4281083 3.282447e-94
## Q2-Q4 0.5062706 0.5348860 0.5623248 1.075388e-184
## Q2-Q5 0.4259018 0.4574975 0.4879788 2.578663e-129
## Q2-Q6 0.5234810 0.5513960 0.5781285 1.757495e-198
## Q2-Q7 0.6501266 0.6722188 0.6931753 0.000000e+00
## Q2-Q8 0.3425926 0.3767693 0.4099501 5.579984e-85
## Q2-Q9 0.4556319 0.4861810 0.5155863 3.376084e-148
## Q2-Q10 0.3464233 0.3804928 0.4135598 9.131376e-87
## Q2-Q11 0.4915283 0.5207263 0.5487548 1.822105e-173
## Q2-Q12 0.6962013 0.7158851 0.7344931 0.000000e+00
## Q2-Q13 0.3667134 0.4001964 0.4326439 1.297778e-96
## Q2-Q14 0.4702225 0.5002339 0.5290898 3.969981e-158
## Q2-Q15 0.3659505 0.3994560 0.4319274 3.129498e-96
## Q3-Q4 0.4693335 0.4993781 0.5282679 1.647531e-157
## Q3-Q5 0.3653695 0.3988923 0.4313817 6.108182e-96
## Q3-Q6 0.3667467 0.4002287 0.4326752 1.248831e-96
## Q3-Q7 0.3904688 0.4232258 0.4549125 5.318107e-109
## Q3-Q8 0.7683496 0.7839542 0.7986273 0.000000e+00
## Q3-Q9 0.4601647 0.4905484 0.5197845 3.104324e-151
## Q3-Q10 0.2853436 0.3209913 0.3557521 6.760094e-61
## Q3-Q11 0.3066780 0.3418064 0.3760050 2.580298e-69
## Q3-Q12 0.4061739 0.4384278 0.4695906 1.005698e-117
## Q3-Q13 0.6919756 0.7118867 0.7307155 0.000000e+00
## Q3-Q14 0.3989973 0.4314834 0.4628876 1.103607e-113
## Q3-Q15 0.2285790 0.2654400 0.3015410 1.676489e-41
## Q4-Q5 0.5438704 0.5709273 0.5967985 8.032837e-216
## Q4-Q6 0.5837641 0.6090539 0.6331630 3.037754e-253
## Q4-Q7 0.5394959 0.5667395 0.5927977 5.260693e-212
## Q4-Q8 0.4589969 0.4894234 0.5187032 1.898345e-150
## Q4-Q9 0.5374441 0.5647747 0.5909202 3.108940e-210
## Q4-Q10 0.3706739 0.4040387 0.4363621 1.298818e-98
## Q4-Q11 0.4932765 0.5224062 0.5503655 9.037626e-175
## Q4-Q12 0.5749365 0.6006273 0.6251350 1.550592e-244
## Q4-Q13 0.4833700 0.5128834 0.5412322 1.798859e-167
## Q4-Q14 0.5782876 0.6038268 0.6281838 8.219433e-248
## Q4-Q15 0.3607035 0.3943633 0.4269973 1.255385e-93
## Q5-Q6 0.4650551 0.4952588 0.5243108 1.470071e-154
## Q5-Q7 0.4142088 0.4461981 0.4770864 2.336455e-122
## Q5-Q8 0.3765708 0.4097576 0.4418941 1.224898e-101
## Q5-Q9 0.4328328 0.4641904 0.4944261 1.462606e-133
## Q5-Q10 0.3951535 0.4277623 0.4592945 1.476449e-111
## Q5-Q11 0.4632435 0.4935141 0.5226345 2.540009e-153
## Q5-Q12 0.4617541 0.4920795 0.5212560 2.612164e-152
## Q5-Q13 0.3953385 0.4279414 0.4594675 1.168044e-111
## Q5-Q14 0.5106389 0.5390785 0.5663399 4.008639e-188
## Q5-Q15 0.4189383 0.4507697 0.4814945 3.849304e-125
## Q6-Q7 0.5120337 0.5404170 0.5676214 3.149502e-189
## Q6-Q8 0.3794902 0.4125879 0.4446310 3.704932e-103
## Q6-Q9 0.4534992 0.4841255 0.5136099 8.755609e-147
## Q6-Q10 0.3747259 0.4079687 0.4401639 1.098605e-100
## Q6-Q11 0.5981808 0.6228032 0.6462508 4.981130e-268
## Q6-Q12 0.5610551 0.5873651 0.6124896 2.403960e-231
## Q6-Q13 0.3915639 0.4242864 0.4559371 1.353745e-109
## Q6-Q14 0.5190730 0.5471694 0.5740847 7.044729e-195
## Q6-Q15 0.4345044 0.4658040 0.4959800 1.340087e-134
## Q7-Q8 0.3795181 0.4126150 0.4446571 3.582463e-103
## Q7-Q9 0.5001718 0.5290301 0.5567146 5.497267e-180
## Q7-Q10 0.3547857 0.3886172 0.4214323 9.585238e-91
## Q7-Q11 0.4624648 0.4927641 0.5219138 8.601767e-153
## Q7-Q12 0.7365820 0.7540288 0.7704729 0.000000e+00
## Q7-Q13 0.4167211 0.4486267 0.4794284 7.858024e-124
## Q7-Q14 0.4867147 0.5160994 0.5443174 6.545724e-170
## Q7-Q15 0.3549662 0.3887925 0.4216021 7.843219e-91
## Q8-Q9 0.4490408 0.4798277 0.5094765 7.372509e-144
## Q8-Q10 0.3302521 0.3647668 0.3983073 2.223163e-79
## Q8-Q11 0.3367608 0.3710987 0.4044508 2.643708e-82
## Q8-Q12 0.3904041 0.4231631 0.4548520 5.764588e-109
## Q8-Q13 0.7398147 0.7570774 0.7733440 0.000000e+00
## Q8-Q14 0.4175690 0.4494462 0.4802185 2.485870e-124
## Q8-Q15 0.2696028 0.3056115 0.3407668 4.328489e-55
## Q9-Q10 0.3321729 0.3666358 0.4001210 3.093874e-80
## Q9-Q11 0.4294991 0.4609717 0.4913259 1.655959e-131
## Q9-Q12 0.5439334 0.5709876 0.5968561 7.071640e-216
## Q9-Q13 0.5083417 0.5368740 0.5642288 2.581509e-186
## Q9-Q14 0.5268510 0.5546263 0.5812182 2.858794e-201
## Q9-Q15 0.2858045 0.3214413 0.3561903 4.516727e-61
## Q10-Q11 0.4188025 0.4506384 0.4813679 4.633595e-125
## Q10-Q12 0.3705162 0.4038857 0.4362140 1.561964e-98
## Q10-Q13 0.3339871 0.3684007 0.4018335 4.747674e-81
## Q10-Q14 0.3815258 0.4145611 0.4465387 3.168647e-104
## Q10-Q15 0.4176732 0.4495470 0.4803157 2.157366e-124
## Q11-Q12 0.5197817 0.5478491 0.5747350 1.870488e-195
## Q11-Q13 0.3651436 0.3986730 0.4311694 7.920246e-96
## Q11-Q14 0.5342028 0.5616704 0.5879532 1.851647e-207
## Q11-Q15 0.5111332 0.5395529 0.5667941 1.629274e-188
## Q12-Q13 0.4545949 0.4851817 0.5146255 1.648258e-147
## Q12-Q14 0.5354702 0.5628844 0.5891136 1.534183e-208
## Q12-Q15 0.3766079 0.4097936 0.4419289 1.171847e-101
## Q13-Q14 0.4667464 0.4968874 0.5258754 1.013162e-155
## Q13-Q15 0.2625236 0.2986885 0.3340155 1.375737e-52
## Q14-Q15 0.4302457 0.4616926 0.4920203 5.766839e-132
# Estimate coefficient alpha
alpha(gcbs)
##
## Reliability analysis
## Call: alpha(x = gcbs)
##
## raw_alpha std.alpha G6(smc) average_r S/N ase mean sd median_r
## 0.93 0.93 0.94 0.48 14 0.002 2.9 1 0.47
##
## lower alpha upper 95% confidence boundaries
## 0.93 0.93 0.94
##
## Reliability if an item is dropped:
## raw_alpha std.alpha G6(smc) average_r S/N alpha se var.r med.r
## Q1 0.93 0.93 0.94 0.48 13 0.0021 0.0105 0.46
## Q2 0.93 0.93 0.94 0.48 13 0.0021 0.0099 0.47
## Q3 0.93 0.93 0.94 0.49 13 0.0020 0.0084 0.48
## Q4 0.93 0.93 0.94 0.47 13 0.0022 0.0105 0.46
## Q5 0.93 0.93 0.94 0.48 13 0.0021 0.0112 0.48
## Q6 0.93 0.93 0.94 0.48 13 0.0021 0.0104 0.46
## Q7 0.93 0.93 0.94 0.48 13 0.0021 0.0098 0.47
## Q8 0.93 0.93 0.94 0.48 13 0.0020 0.0086 0.49
## Q9 0.93 0.93 0.94 0.48 13 0.0021 0.0108 0.46
## Q10 0.93 0.93 0.94 0.49 14 0.0020 0.0102 0.49
## Q11 0.93 0.93 0.94 0.48 13 0.0021 0.0104 0.46
## Q12 0.93 0.93 0.94 0.47 13 0.0022 0.0093 0.46
## Q13 0.93 0.93 0.94 0.48 13 0.0021 0.0092 0.48
## Q14 0.93 0.93 0.94 0.48 13 0.0021 0.0109 0.46
## Q15 0.93 0.93 0.94 0.49 14 0.0020 0.0095 0.49
##
## Item statistics
## n raw.r std.r r.cor r.drop mean sd
## Q1 2495 0.73 0.73 0.70 0.68 3.5 1.5
## Q2 2495 0.74 0.74 0.72 0.69 3.0 1.5
## Q3 2495 0.68 0.67 0.66 0.62 2.0 1.4
## Q4 2495 0.78 0.78 0.76 0.74 2.6 1.5
## Q5 2495 0.70 0.70 0.67 0.65 3.3 1.5
## Q6 2495 0.76 0.76 0.74 0.72 3.1 1.5
## Q7 2495 0.75 0.75 0.73 0.70 2.7 1.5
## Q8 2495 0.69 0.69 0.68 0.63 2.5 1.6
## Q9 2495 0.72 0.72 0.69 0.67 2.2 1.4
## Q10 2495 0.61 0.61 0.57 0.55 3.5 1.4
## Q11 2495 0.74 0.74 0.72 0.69 3.3 1.4
## Q12 2495 0.79 0.79 0.79 0.75 2.6 1.5
## Q13 2495 0.71 0.71 0.70 0.66 2.1 1.4
## Q14 2495 0.76 0.76 0.74 0.71 3.0 1.5
## Q15 2495 0.60 0.62 0.58 0.56 4.2 1.1
##
## Non missing response frequency for each item
## 0 1 2 3 4 5 miss
## Q1 0.00 0.16 0.12 0.12 0.27 0.32 0
## Q2 0.01 0.23 0.19 0.16 0.20 0.22 0
## Q3 0.00 0.55 0.13 0.12 0.10 0.10 0
## Q4 0.00 0.32 0.18 0.15 0.20 0.14 0
## Q5 0.00 0.19 0.14 0.13 0.28 0.26 0
## Q6 0.00 0.23 0.15 0.15 0.23 0.24 0
## Q7 0.00 0.33 0.19 0.13 0.18 0.17 0
## Q8 0.00 0.44 0.12 0.14 0.12 0.18 0
## Q9 0.00 0.45 0.19 0.12 0.12 0.11 0
## Q10 0.00 0.14 0.12 0.14 0.30 0.30 0
## Q11 0.00 0.16 0.14 0.19 0.27 0.24 0
## Q12 0.00 0.34 0.18 0.15 0.17 0.17 0
## Q13 0.01 0.51 0.15 0.15 0.10 0.09 0
## Q14 0.00 0.25 0.17 0.15 0.22 0.20 0
## Q15 0.00 0.05 0.05 0.08 0.27 0.55 0
# Calculate split-half reliability
splitHalf(gcbs)
## Split half reliabilities
## Call: splitHalf(r = gcbs)
##
## Maximum split half reliability (lambda 4) = 0.95
## Guttman lambda 6 = 0.94
## Average split half reliability = 0.93
## Guttman lambda 3 (alpha) = 0.93
## Guttman lambda 2 = 0.93
## Minimum split half reliability (beta) = 0.86
## Average interitem r = 0.48 with median = 0.47
Chapter 2 - Multidimensional EFA
Determining dimensionality:
Understanding multidimensional data:
Investigating model fit:
Example code includes:
data(bfi, package="psych")
glimpse(bfi)
## Observations: 2,800
## Variables: 28
## $ A1 <int> 2, 2, 5, 4, 2, 6, 2, 4, 4, 2, 4, 2, 5, 5, 4, 4, 4, 5, 4, ...
## $ A2 <int> 4, 4, 4, 4, 3, 6, 5, 3, 3, 5, 4, 5, 5, 5, 5, 3, 6, 5, 4, ...
## $ A3 <int> 3, 5, 5, 6, 3, 5, 5, 1, 6, 6, 5, 5, 5, 5, 2, 6, 6, 5, 5, ...
## $ A4 <int> 4, 2, 4, 5, 4, 6, 3, 5, 3, 6, 6, 5, 6, 6, 2, 6, 2, 4, 4, ...
## $ A5 <int> 4, 5, 4, 5, 5, 5, 5, 1, 3, 5, 5, 5, 4, 6, 1, 3, 5, 5, 3, ...
## $ C1 <int> 2, 5, 4, 4, 4, 6, 5, 3, 6, 6, 4, 5, 5, 4, 5, 5, 4, 5, 5, ...
## $ C2 <int> 3, 4, 5, 4, 4, 6, 4, 2, 6, 5, 3, 4, 4, 4, 5, 5, 4, 5, 4, ...
## $ C3 <int> 3, 4, 4, 3, 5, 6, 4, 4, 3, 6, 5, 5, 3, 4, 5, 5, 4, 5, 5, ...
## $ C4 <int> 4, 3, 2, 5, 3, 1, 2, 2, 4, 2, 3, 4, 2, 2, 2, 3, 4, 4, 4, ...
## $ C5 <int> 4, 4, 5, 5, 2, 3, 3, 4, 5, 1, 2, 5, 2, 1, 2, 5, 4, 3, 6, ...
## $ E1 <int> 3, 1, 2, 5, 2, 2, 4, 3, 5, 2, 1, 3, 3, 2, 3, 1, 1, 2, 1, ...
## $ E2 <int> 3, 1, 4, 3, 2, 1, 3, 6, 3, 2, 3, 3, 3, 2, 4, 1, 2, 2, 2, ...
## $ E3 <int> 3, 6, 4, 4, 5, 6, 4, 4, NA, 4, 2, 4, 3, 4, 3, 6, 5, 4, 4,...
## $ E4 <int> 4, 4, 4, 4, 4, 5, 5, 2, 4, 5, 5, 5, 2, 6, 6, 6, 5, 6, 5, ...
## $ E5 <int> 4, 3, 5, 4, 5, 6, 5, 1, 3, 5, 4, 4, 4, 5, 5, 4, 5, 6, 5, ...
## $ N1 <int> 3, 3, 4, 2, 2, 3, 1, 6, 5, 5, 3, 4, 1, 1, 2, 4, 4, 6, 5, ...
## $ N2 <int> 4, 3, 5, 5, 3, 5, 2, 3, 5, 5, 3, 5, 2, 1, 4, 5, 4, 5, 6, ...
## $ N3 <int> 2, 3, 4, 2, 4, 2, 2, 2, 2, 5, 4, 3, 2, 1, 2, 4, 4, 5, 5, ...
## $ N4 <int> 2, 5, 2, 4, 4, 2, 1, 6, 3, 2, 2, 2, 2, 2, 2, 5, 4, 4, 5, ...
## $ N5 <int> 3, 5, 3, 1, 3, 3, 1, 4, 3, 4, 3, NA, 2, 1, 3, 5, 5, 4, 2,...
## $ O1 <int> 3, 4, 4, 3, 3, 4, 5, 3, 6, 5, 5, 4, 4, 5, 5, 6, 5, 5, 4, ...
## $ O2 <int> 6, 2, 2, 3, 3, 3, 2, 2, 6, 1, 3, 6, 2, 3, 2, 6, 1, 1, 2, ...
## $ O3 <int> 3, 4, 5, 4, 4, 5, 5, 4, 6, 5, 5, 4, 4, 4, 5, 6, 5, 4, 2, ...
## $ O4 <int> 4, 3, 5, 3, 3, 6, 6, 5, 6, 5, 6, 5, 5, 4, 5, 3, 6, 5, 4, ...
## $ O5 <int> 3, 3, 2, 5, 3, 1, 1, 3, 1, 2, 3, 4, 2, 4, 5, 2, 3, 4, 2, ...
## $ gender <int> 1, 2, 2, 2, 1, 2, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2, 1, 2, ...
## $ education <int> NA, NA, NA, NA, NA, 3, NA, 2, 1, NA, 1, NA, NA, NA, 1, NA...
## $ age <int> 16, 18, 17, 17, 17, 21, 18, 19, 19, 17, 21, 16, 16, 16, 1...
# Same split pattern as used earlier for gcbs, now applied to the bfi data:
# half the rows for EFA, the complementary half for CFA.
# NOTE(review): no set.seed() call, so this split is not reproducible across
# runs.
# Establish two sets of indices to split the dataset
N <- nrow(bfi)
indices <- seq(1, N)
indices_EFA <- sample(indices, floor((.5*N)))
indices_CFA <- indices[!(indices %in% indices_EFA)]
# Use those indices to split the dataset into halves for your EFA and CFA
bfi_EFA <- bfi[indices_EFA, ]
bfi_CFA <- bfi[indices_CFA, ]
# Determine dimensionality: correlation matrix -> eigenvalues -> scree plot,
# then fit the EFA with the indicated number of factors.
# Calculate the correlation matrix first
# NOTE(review): "pairwise.complete.obs" uses a different subset of rows for
# each cell, which can in general yield a non-positive-definite matrix —
# the all-positive eigenvalues below show it was fine for this run.
bfi_EFA_cor <- cor(bfi_EFA, use = "pairwise.complete.obs")
# Then use that correlation matrix to calculate eigenvalues
eigenvals <- eigen(bfi_EFA_cor)
# Look at the eigenvalues returned
eigenvals$values
## [1] 5.0300529 2.8855687 2.1326114 1.8423851 1.5839370 1.3565596 1.1281967
## [8] 0.8876166 0.8368945 0.7759991 0.7411295 0.7223954 0.6965653 0.6688663
## [15] 0.6495947 0.6406188 0.5569057 0.5550641 0.5408830 0.5243398 0.4913524
## [22] 0.4769243 0.4467000 0.4206217 0.4021440 0.3781987 0.3609526 0.2669220
# Then use that correlation matrix to create the scree plot
# (factors = FALSE shows only the principal-components line)
scree(bfi_EFA_cor, factors = FALSE)
# Run the EFA with six factors (as indicated by your scree plot)
EFA_model <- fa(bfi_EFA, nfactors=6)
## Loading required namespace: GPArotation
# View results from the model object
EFA_model
## Factor Analysis using method = minres
## Call: fa(r = bfi_EFA, nfactors = 6)
## Standardized loadings (pattern matrix) based upon correlation matrix
## MR2 MR5 MR3 MR1 MR4 MR6 h2 u2 com
## A1 0.07 -0.39 0.05 -0.03 0.02 0.51 0.401 0.60 1.9
## A2 0.04 0.60 0.09 -0.04 0.05 -0.19 0.445 0.56 1.3
## A3 -0.04 0.65 0.03 -0.06 0.05 -0.01 0.484 0.52 1.0
## A4 -0.07 0.44 0.19 -0.06 -0.12 0.02 0.267 0.73 1.6
## A5 -0.17 0.59 0.01 -0.12 0.10 0.08 0.494 0.51 1.4
## C1 0.04 0.00 0.54 0.12 0.20 0.10 0.367 0.63 1.4
## C2 0.06 0.12 0.59 0.21 0.11 0.17 0.440 0.56 1.6
## C3 0.01 0.11 0.54 0.12 0.01 0.09 0.316 0.68 1.2
## C4 0.07 0.05 -0.69 0.11 0.04 0.19 0.553 0.45 1.2
## C5 0.11 0.00 -0.57 0.19 0.10 0.01 0.430 0.57 1.4
## E1 -0.13 -0.14 0.07 0.58 -0.10 0.07 0.395 0.61 1.3
## E2 0.07 -0.09 -0.03 0.69 -0.10 0.03 0.593 0.41 1.1
## E3 0.06 0.26 -0.02 -0.30 0.41 0.13 0.471 0.53 2.9
## E4 -0.04 0.38 0.01 -0.48 0.01 0.26 0.567 0.43 2.5
## E5 0.15 0.07 0.24 -0.35 0.31 0.03 0.398 0.60 3.3
## N1 0.80 -0.10 -0.01 -0.09 -0.03 0.04 0.641 0.36 1.1
## N2 0.83 -0.10 0.02 -0.08 0.01 -0.03 0.661 0.34 1.1
## N3 0.70 0.10 -0.07 0.12 0.04 0.02 0.568 0.43 1.1
## N4 0.43 0.08 -0.15 0.38 0.05 -0.04 0.458 0.54 2.3
## N5 0.53 0.22 0.01 0.21 -0.15 0.04 0.419 0.58 1.9
## O1 -0.05 -0.01 0.04 0.00 0.58 0.08 0.358 0.64 1.1
## O2 0.13 0.19 -0.10 0.01 -0.38 0.24 0.254 0.75 2.7
## O3 -0.01 0.06 0.01 -0.07 0.67 0.02 0.507 0.49 1.0
## O4 0.11 0.17 -0.03 0.39 0.35 -0.06 0.279 0.72 2.6
## O5 0.05 0.10 -0.05 -0.01 -0.46 0.28 0.293 0.71 1.8
## gender 0.24 0.23 0.14 -0.11 -0.17 -0.17 0.154 0.85 4.8
## education 0.00 -0.08 0.04 -0.01 0.09 -0.21 0.057 0.94 1.8
## age -0.03 0.04 0.10 -0.11 -0.01 -0.28 0.109 0.89 1.7
##
## MR2 MR5 MR3 MR1 MR4 MR6
## SS loadings 2.58 2.14 2.05 2.01 1.82 0.78
## Proportion Var 0.09 0.08 0.07 0.07 0.07 0.03
## Cumulative Var 0.09 0.17 0.24 0.31 0.38 0.41
## Proportion Explained 0.23 0.19 0.18 0.18 0.16 0.07
## Cumulative Proportion 0.23 0.41 0.59 0.77 0.93 1.00
##
## With factor correlations of
## MR2 MR5 MR3 MR1 MR4 MR6
## MR2 1.00 -0.03 -0.21 0.23 -0.02 0.11
## MR5 -0.03 1.00 0.18 -0.26 0.24 0.03
## MR3 -0.21 0.18 1.00 -0.17 0.22 0.00
## MR1 0.23 -0.26 -0.17 1.00 -0.18 -0.05
## MR4 -0.02 0.24 0.22 -0.18 1.00 0.03
## MR6 0.11 0.03 0.00 -0.05 0.03 1.00
##
## Mean item complexity = 1.8
## Test of the hypothesis that 6 factors are sufficient.
##
## The degrees of freedom for the null model are 378 and the objective function was 7.67 with Chi Square of 10648.54
## The degrees of freedom for the model are 225 and the objective function was 0.6
##
## The root mean square of the residuals (RMSR) is 0.02
## The df corrected root mean square of the residuals is 0.03
##
## The harmonic number of observations is 1375 with the empirical chi square 624.42 with prob < 2.9e-39
## The total number of observations was 1400 with Likelihood Chi Square = 836.09 with prob < 3.8e-71
##
## Tucker Lewis Index of factoring reliability = 0.9
## RMSEA index = 0.044 and the 90 % confidence intervals are 0.041 0.047
## BIC = -793.86
## Fit based upon off diagonal values = 0.98
## Measures of factor score adequacy
## MR2 MR5 MR3 MR1 MR4 MR6
## Correlation of (regression) scores with factors 0.93 0.89 0.88 0.89 0.86 0.75
## Multiple R square of scores with factors 0.86 0.78 0.78 0.78 0.75 0.56
## Minimum correlation of possible factor scores 0.72 0.57 0.56 0.57 0.50 0.12
# Run the EFA with six factors (as indicated by your scree plot)
# fa() is from the psych package; factors print as MR1..MR6 (minres extraction)
EFA_model <- fa(bfi_EFA, nfactors=6)
# View items' factor loadings
# (the print method suppresses small loadings, so blanks are near-zero values)
EFA_model$loadings
##
## Loadings:
## MR2 MR5 MR3 MR1 MR4 MR6
## A1 -0.391 0.510
## A2 0.600 -0.192
## A3 0.651
## A4 0.440 0.187 -0.124
## A5 -0.166 0.588 -0.121 0.100
## C1 0.542 0.115 0.198
## C2 0.119 0.591 0.209 0.112 0.170
## C3 0.105 0.539 0.124
## C4 -0.687 0.107 0.191
## C5 0.107 -0.573 0.193 0.102
## E1 -0.125 -0.139 0.576
## E2 0.692
## E3 0.260 -0.296 0.408 0.126
## E4 0.376 -0.479 0.255
## E5 0.151 0.241 -0.354 0.309
## N1 0.803 -0.101
## N2 0.829
## N3 0.695 0.124
## N4 0.434 -0.148 0.382
## N5 0.532 0.216 0.208 -0.152
## O1 0.578
## O2 0.132 0.189 -0.382 0.242
## O3 0.674
## O4 0.108 0.171 0.386 0.353
## O5 0.103 -0.463 0.281
## gender 0.240 0.232 0.143 -0.109 -0.173 -0.168
## education -0.210
## age 0.103 -0.108 -0.276
##
## MR2 MR5 MR3 MR1 MR4 MR6
## SS loadings 2.490 1.965 1.919 1.796 1.715 0.777
## Proportion Var 0.089 0.070 0.069 0.064 0.061 0.028
## Cumulative Var 0.089 0.159 0.228 0.292 0.353 0.381
# View the first few lines of examinees' factor scores
# Examinees with any missing item responses get NA scores (e.g. the first row)
head(EFA_model$scores)
## MR2 MR5 MR3 MR1 MR4 MR6
## 62551 NA NA NA NA NA NA
## 67093 -0.7724195 0.5611340 -0.98862424 -0.35665187 0.4818819 1.16616564
## 62162 -1.1613038 0.7091861 0.01602322 0.25442770 -0.3954765 0.61004704
## 61896 -0.1630297 0.3951015 0.20243968 -0.88470545 0.1207347 0.04158926
## 67438 0.1715825 -1.5105203 -0.13152183 0.09266676 -0.3904054 1.04839765
## 63328 1.3551168 0.5068694 1.21405963 -0.52823218 0.7409618 0.74609206
# Run each theorized EFA on your dataset
bfi_theory <- fa(bfi_EFA, nfactors = 5)
bfi_eigen <- fa(bfi_EFA, nfactors = 6)
# Compare the BIC values
# (lower BIC indicates better fit; the eigenvalue-based six-factor model wins)
bfi_theory$BIC
## [1] -511.141
bfi_eigen$BIC
## [1] -793.8647
Chapter 3 - Confirmatory Factor Analysis
Setting up CFA:
Understanding the sem() syntax:
Investigating model fit:
Example code includes:
# Conduct a five-factor EFA on the EFA half of the dataset
EFA_model <- fa(bfi_EFA, nfactors = 5)
# Use the wrapper function to create syntax for use with the sem() function
EFA_syn <- structure.sem(EFA_model)
# Set up syntax specifying which items load onto each factor
# (Big Five theory: Agreeableness, Conscientiousness, Extraversion,
# Neuroticism, Openness — five items each, no cross-loadings)
theory_syn_eq <- "
AGE: A1, A2, A3, A4, A5
CON: C1, C2, C3, C4, C5
EXT: E1, E2, E3, E4, E5
NEU: N1, N2, N3, N4, N5
OPE: O1, O2, O3, O4, O5
"
library(sem)
##
## Attaching package: 'sem'
## The following objects are masked from 'package:lavaan':
##
## cfa, sem
# Feed the syntax in to have variances and covariances automatically added
# NOTE: this is sem::cfa(), which masks lavaan::cfa() after library(sem)
theory_syn <- cfa(text = theory_syn_eq, reference.indicators = FALSE)
## NOTE: adding 25 variances to the model
# Use the sem() function to run a CFA
# (fit on the holdout half, bfi_CFA, not the half used for the EFA)
theory_CFA <- sem(theory_syn, data = bfi_CFA)
## Warning in sem.semmod(theory_syn, data = bfi_CFA): -289 observations removed due
## to missingness
## Warning in sem.semmod(theory_syn, data = bfi_CFA): The following observed variables are in the input covariance or raw-moment matrix but do not appear in the model:
## gender, education, age
# Use the summary function to view fit information and parameter estimates
summary(theory_CFA)
##
## Model Chisquare = 2005.016 Df = 265 Pr(>Chisq) = 2.150816e-264
## AIC = 2125.016
## BIC = 146.5663
##
## Normalized Residuals
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -6.8135 -0.5790 0.7005 0.9165 2.3369 9.1189
##
## R-square for Endogenous Variables
## A1 A2 A3 A4 A5 C1 C2 C3 C4 C5 E1
## 0.1177 0.4221 0.5726 0.2652 0.4584 0.3015 0.3895 0.2854 0.4763 0.3806 0.3100
## E2 E3 E4 E5 N1 N2 N3 N4 N5 O1 O2
## 0.4955 0.4090 0.5185 0.3099 0.6809 0.6209 0.5142 0.3434 0.2209 0.3230 0.2294
## O3 O4 O5
## 0.4473 0.0615 0.2625
##
## Parameter Estimates
## Estimate Std Error z value Pr(>|z|)
## lam[A1:AGE] -0.47310342 0.04469539 -10.585060 3.495553e-26 A1 <--- AGE
## lam[A2:AGE] 0.74682312 0.03424190 21.810210 1.856211e-105 A2 <--- AGE
## lam[A3:AGE] 0.97246742 0.03690975 26.347171 5.530212e-153 A3 <--- AGE
## lam[A4:AGE] 0.73031369 0.04413591 16.546928 1.684830e-61 A4 <--- AGE
## lam[A5:AGE] 0.84567717 0.03685897 22.943592 1.707357e-116 A5 <--- AGE
## lam[C1:CON] -0.67102860 0.03893437 -17.234866 1.453902e-66 C1 <--- CON
## lam[C2:CON] -0.81745260 0.04090714 -19.983130 7.722751e-89 C2 <--- CON
## lam[C3:CON] -0.67484497 0.04040327 -16.702731 1.252010e-62 C3 <--- CON
## lam[C4:CON] 0.94244534 0.04196174 22.459632 1.030377e-111 C4 <--- CON
## lam[C5:CON] 1.00338850 0.05089376 19.715355 1.591951e-86 C5 <--- CON
## lam[E1:EXT] 0.88897886 0.04856677 18.304263 7.652070e-75 E1 <--- EXT
## lam[E2:EXT] 1.12282478 0.04596055 24.430185 8.174273e-132 E2 <--- EXT
## lam[E3:EXT] -0.86265804 0.03985228 -21.646390 6.573210e-104 E3 <--- EXT
## lam[E4:EXT] -1.04778146 0.04165946 -25.151108 1.374315e-139 E4 <--- EXT
## lam[E5:EXT] -0.73878674 0.04037337 -18.298864 8.449171e-75 E5 <--- EXT
## lam[N1:NEU] 1.28350486 0.04147064 30.949724 2.562431e-210 N1 <--- NEU
## lam[N2:NEU] 1.20126201 0.04131090 29.078570 6.700813e-186 N2 <--- NEU
## lam[N3:NEU] 1.15489837 0.04498842 25.671017 2.463597e-145 N3 <--- NEU
## lam[N4:NEU] 0.92407395 0.04646996 19.885404 5.444165e-88 N4 <--- NEU
## lam[N5:NEU] 0.76200858 0.04958272 15.368429 2.665543e-53 N5 <--- NEU
## lam[O1:OPE] 0.62901029 0.03848638 16.343713 4.822431e-60 O1 <--- OPE
## lam[O2:OPE] -0.75677801 0.05522087 -13.704565 9.533735e-43 O2 <--- OPE
## lam[O3:OPE] 0.79299695 0.04156934 19.076487 3.959980e-81 O3 <--- OPE
## lam[O4:OPE] 0.30437169 0.04406913 6.906687 4.961028e-12 O4 <--- OPE
## lam[O5:OPE] -0.68830815 0.04680655 -14.705379 5.953631e-49 O5 <--- OPE
## C[AGE,CON] -0.33903475 0.03616712 -9.374116 6.975805e-21 CON <--> AGE
## C[AGE,EXT] -0.71675158 0.02419566 -29.623150 7.522744e-193 EXT <--> AGE
## C[AGE,NEU] -0.23767064 0.03540025 -6.713812 1.896045e-11 NEU <--> AGE
## C[AGE,OPE] 0.23285485 0.04030234 5.777700 7.572854e-09 OPE <--> AGE
## C[CON,EXT] 0.38058118 0.03501044 10.870507 1.593130e-27 EXT <--> CON
## C[CON,NEU] 0.25969556 0.03567111 7.280277 3.331352e-13 NEU <--> CON
## C[CON,OPE] -0.28622586 0.04021418 -7.117536 1.098734e-12 OPE <--> CON
## C[EXT,NEU] 0.27605059 0.03454400 7.991276 1.335494e-15 NEU <--> EXT
## C[EXT,OPE] -0.34443896 0.03822892 -9.009905 2.062345e-19 OPE <--> EXT
## C[NEU,OPE] -0.09901255 0.03959699 -2.500507 1.240156e-02 OPE <--> NEU
## V[A1] 1.67705932 0.07357444 22.794048 5.252890e-115 A1 <--> A1
## V[A2] 0.76354863 0.03963774 19.263174 1.094593e-82 A2 <--> A2
## V[A3] 0.70576489 0.04510292 15.647875 3.435002e-55 A3 <--> A3
## V[A4] 1.47810259 0.06883999 21.471570 2.871544e-102 A4 <--> A4
## V[A5] 0.84511989 0.04553868 18.558285 6.990034e-77 A5 <--> A5
## V[C1] 1.04294084 0.05130818 20.326989 7.421893e-92 C1 <--> C1
## V[C2] 1.04727939 0.05588028 18.741486 2.271949e-78 C2 <--> C2
## V[C3] 1.14007575 0.05541605 20.573025 4.788395e-94 C3 <--> C3
## V[C4] 0.97670448 0.05854026 16.684321 1.704330e-62 C4 <--> C4
## V[C5] 1.63858678 0.08658845 18.923849 7.255663e-80 C5 <--> C5
## V[E1] 1.75863096 0.08303725 21.178820 1.497351e-99 E1 <--> E1
## V[E2] 1.28369574 0.07018375 18.290498 9.851190e-75 E2 <--> E2
## V[E3] 1.07543181 0.05412632 19.868927 7.560203e-88 E3 <--> E3
## V[E4] 1.01968201 0.05734980 17.780043 1.009021e-70 E4 <--> E4
## V[E5] 1.21546936 0.05738608 21.180563 1.442987e-99 E5 <--> E5
## V[N1] 0.77202708 0.05416344 14.253656 4.254140e-46 N1 <--> N1
## V[N2] 0.88100921 0.05399874 16.315365 7.674585e-60 N2 <--> N2
## V[N3] 1.25987596 0.06631127 18.999423 1.724296e-80 N3 <--> N3
## V[N4] 1.63271952 0.07642767 21.362938 2.956188e-101 N4 <--> N4
## V[N5] 2.04833227 0.09152194 22.380779 6.057690e-111 N5 <--> N5
## V[O1] 0.82913181 0.04602848 18.013452 1.527947e-72 O1 <--> O1
## V[O2] 1.92399378 0.09536054 20.175996 1.591273e-90 O2 <--> O2
## V[O3] 0.77711574 0.05441673 14.280824 2.881740e-46 O3 <--> O3
## V[O4] 1.41350151 0.06190555 22.833195 2.146796e-115 O4 <--> O4
## V[O5] 1.33126454 0.06832159 19.485268 1.464112e-84 O5 <--> O5
##
## Iterations = 26
# CAUTION THAT THIS WILL SET GLOBAL OPTIONS
# Set the options to include various fit indices so they will print
# (the previous value is saved in origFit so it can be restored afterwards
# via options(fit.indices = origFit))
origFit <- getOption("fit.indices")
options(fit.indices = c("CFI", "GFI", "RMSEA", "BIC"))
# Use the summary function to view fit information and parameter estimates
summary(theory_CFA)
##
## Model Chisquare = 2005.016 Df = 265 Pr(>Chisq) = 2.150816e-264
## Goodness-of-fit index = 0.8594733
## RMSEA index = 0.07691165 90% CI: (NA, NA)
## Bentler CFI = 0.7863506
## BIC = 146.5663
##
## Normalized Residuals
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -6.8135 -0.5790 0.7005 0.9165 2.3369 9.1189
##
## R-square for Endogenous Variables
## A1 A2 A3 A4 A5 C1 C2 C3 C4 C5 E1
## 0.1177 0.4221 0.5726 0.2652 0.4584 0.3015 0.3895 0.2854 0.4763 0.3806 0.3100
## E2 E3 E4 E5 N1 N2 N3 N4 N5 O1 O2
## 0.4955 0.4090 0.5185 0.3099 0.6809 0.6209 0.5142 0.3434 0.2209 0.3230 0.2294
## O3 O4 O5
## 0.4473 0.0615 0.2625
##
## Parameter Estimates
## Estimate Std Error z value Pr(>|z|)
## lam[A1:AGE] -0.47310342 0.04469539 -10.585060 3.495553e-26 A1 <--- AGE
## lam[A2:AGE] 0.74682312 0.03424190 21.810210 1.856211e-105 A2 <--- AGE
## lam[A3:AGE] 0.97246742 0.03690975 26.347171 5.530212e-153 A3 <--- AGE
## lam[A4:AGE] 0.73031369 0.04413591 16.546928 1.684830e-61 A4 <--- AGE
## lam[A5:AGE] 0.84567717 0.03685897 22.943592 1.707357e-116 A5 <--- AGE
## lam[C1:CON] -0.67102860 0.03893437 -17.234866 1.453902e-66 C1 <--- CON
## lam[C2:CON] -0.81745260 0.04090714 -19.983130 7.722751e-89 C2 <--- CON
## lam[C3:CON] -0.67484497 0.04040327 -16.702731 1.252010e-62 C3 <--- CON
## lam[C4:CON] 0.94244534 0.04196174 22.459632 1.030377e-111 C4 <--- CON
## lam[C5:CON] 1.00338850 0.05089376 19.715355 1.591951e-86 C5 <--- CON
## lam[E1:EXT] 0.88897886 0.04856677 18.304263 7.652070e-75 E1 <--- EXT
## lam[E2:EXT] 1.12282478 0.04596055 24.430185 8.174273e-132 E2 <--- EXT
## lam[E3:EXT] -0.86265804 0.03985228 -21.646390 6.573210e-104 E3 <--- EXT
## lam[E4:EXT] -1.04778146 0.04165946 -25.151108 1.374315e-139 E4 <--- EXT
## lam[E5:EXT] -0.73878674 0.04037337 -18.298864 8.449171e-75 E5 <--- EXT
## lam[N1:NEU] 1.28350486 0.04147064 30.949724 2.562431e-210 N1 <--- NEU
## lam[N2:NEU] 1.20126201 0.04131090 29.078570 6.700813e-186 N2 <--- NEU
## lam[N3:NEU] 1.15489837 0.04498842 25.671017 2.463597e-145 N3 <--- NEU
## lam[N4:NEU] 0.92407395 0.04646996 19.885404 5.444165e-88 N4 <--- NEU
## lam[N5:NEU] 0.76200858 0.04958272 15.368429 2.665543e-53 N5 <--- NEU
## lam[O1:OPE] 0.62901029 0.03848638 16.343713 4.822431e-60 O1 <--- OPE
## lam[O2:OPE] -0.75677801 0.05522087 -13.704565 9.533735e-43 O2 <--- OPE
## lam[O3:OPE] 0.79299695 0.04156934 19.076487 3.959980e-81 O3 <--- OPE
## lam[O4:OPE] 0.30437169 0.04406913 6.906687 4.961028e-12 O4 <--- OPE
## lam[O5:OPE] -0.68830815 0.04680655 -14.705379 5.953631e-49 O5 <--- OPE
## C[AGE,CON] -0.33903475 0.03616712 -9.374116 6.975805e-21 CON <--> AGE
## C[AGE,EXT] -0.71675158 0.02419566 -29.623150 7.522744e-193 EXT <--> AGE
## C[AGE,NEU] -0.23767064 0.03540025 -6.713812 1.896045e-11 NEU <--> AGE
## C[AGE,OPE] 0.23285485 0.04030234 5.777700 7.572854e-09 OPE <--> AGE
## C[CON,EXT] 0.38058118 0.03501044 10.870507 1.593130e-27 EXT <--> CON
## C[CON,NEU] 0.25969556 0.03567111 7.280277 3.331352e-13 NEU <--> CON
## C[CON,OPE] -0.28622586 0.04021418 -7.117536 1.098734e-12 OPE <--> CON
## C[EXT,NEU] 0.27605059 0.03454400 7.991276 1.335494e-15 NEU <--> EXT
## C[EXT,OPE] -0.34443896 0.03822892 -9.009905 2.062345e-19 OPE <--> EXT
## C[NEU,OPE] -0.09901255 0.03959699 -2.500507 1.240156e-02 OPE <--> NEU
## V[A1] 1.67705932 0.07357444 22.794048 5.252890e-115 A1 <--> A1
## V[A2] 0.76354863 0.03963774 19.263174 1.094593e-82 A2 <--> A2
## V[A3] 0.70576489 0.04510292 15.647875 3.435002e-55 A3 <--> A3
## V[A4] 1.47810259 0.06883999 21.471570 2.871544e-102 A4 <--> A4
## V[A5] 0.84511989 0.04553868 18.558285 6.990034e-77 A5 <--> A5
## V[C1] 1.04294084 0.05130818 20.326989 7.421893e-92 C1 <--> C1
## V[C2] 1.04727939 0.05588028 18.741486 2.271949e-78 C2 <--> C2
## V[C3] 1.14007575 0.05541605 20.573025 4.788395e-94 C3 <--> C3
## V[C4] 0.97670448 0.05854026 16.684321 1.704330e-62 C4 <--> C4
## V[C5] 1.63858678 0.08658845 18.923849 7.255663e-80 C5 <--> C5
## V[E1] 1.75863096 0.08303725 21.178820 1.497351e-99 E1 <--> E1
## V[E2] 1.28369574 0.07018375 18.290498 9.851190e-75 E2 <--> E2
## V[E3] 1.07543181 0.05412632 19.868927 7.560203e-88 E3 <--> E3
## V[E4] 1.01968201 0.05734980 17.780043 1.009021e-70 E4 <--> E4
## V[E5] 1.21546936 0.05738608 21.180563 1.442987e-99 E5 <--> E5
## V[N1] 0.77202708 0.05416344 14.253656 4.254140e-46 N1 <--> N1
## V[N2] 0.88100921 0.05399874 16.315365 7.674585e-60 N2 <--> N2
## V[N3] 1.25987596 0.06631127 18.999423 1.724296e-80 N3 <--> N3
## V[N4] 1.63271952 0.07642767 21.362938 2.956188e-101 N4 <--> N4
## V[N5] 2.04833227 0.09152194 22.380779 6.057690e-111 N5 <--> N5
## V[O1] 0.82913181 0.04602848 18.013452 1.527947e-72 O1 <--> O1
## V[O2] 1.92399378 0.09536054 20.175996 1.591273e-90 O2 <--> O2
## V[O3] 0.77711574 0.05441673 14.280824 2.881740e-46 O3 <--> O3
## V[O4] 1.41350151 0.06190555 22.833195 2.146796e-115 O4 <--> O4
## V[O5] 1.33126454 0.06832159 19.485268 1.464112e-84 O5 <--> O5
##
## Iterations = 26
# Run a CFA using the EFA syntax you created earlier
EFA_CFA <- sem(EFA_syn, data = bfi_CFA)
## Warning in sem.semmod(EFA_syn, data = bfi_CFA): -289 observations removed due to
## missingness
# Locate the BIC in the fit statistics of the summary output
summary(EFA_CFA)$BIC
## [1] 480.1274
# Compare EFA_CFA BIC to the BIC from the CFA based on theory
# (lower BIC is better: the theory-based model fits the holdout half better)
summary(theory_CFA)$BIC
## [1] 146.5663
# Reset to baseline
options(fit.indices = origFit)
Chapter 4 - Refining Your Measure and Model
EFA vs CFA Revisited:
Adding Loadings to Improve Fit:
Improving Fit by Removing Loadings:
Wrap-Up:
Example code includes:
# CAUTION THAT THIS WILL SET GLOBAL OPTIONS
# Set the options to include various fit indices so they will print
# (origFit is saved so the global option can be restored at the end)
origFit <- getOption("fit.indices")
options(fit.indices = c("CFI", "GFI", "RMSEA", "BIC"))
# View the first five rows of the EFA loadings
EFA_model$loadings[1:5, ]
## MR2 MR3 MR1 MR5 MR4
## A1 0.194806449 0.08286748 -0.150193662 -0.4264281 -0.01822345
## A2 -0.007392792 0.06744147 -0.009306157 0.6261111 0.04454954
## A3 -0.028215628 0.02767518 -0.105034038 0.6441602 0.02841403
## A4 -0.049129621 0.18695432 -0.091956712 0.4276816 -0.14686861
## A5 -0.123184595 0.01709411 -0.191147915 0.5553174 0.06932404
# View the first five loadings from the CFA estimated from the EFA results
# (compare with the EFA loadings above: signs and magnitudes broadly agree)
summary(EFA_CFA)$coeff[1:5, ]
## Estimate Std Error z value Pr(>|z|)
## F4A1 -0.5184257 0.04528396 -11.44833 2.397187e-30 A1 <--- MR5
## F4A2 0.7768131 0.03524249 22.04195 1.141266e-107 A2 <--- MR5
## F4A3 0.9968771 0.03885030 25.65944 3.317365e-145 A3 <--- MR5
## F4A4 0.7235088 0.04521136 16.00281 1.221362e-57 A4 <--- MR5
## F4A5 0.7768296 0.03870116 20.07251 1.283512e-89 A5 <--- MR5
# Extracting factor scores from the EFA model
EFA_scores <- EFA_model$scores
# Calculating factor scores by applying the CFA parameters to the EFA dataset
CFA_scores <- fscores(EFA_CFA, data = bfi_EFA)
# Comparing factor scores from the EFA and CFA results from the bfi_EFA dataset
# NOTE(review): xlim/ylim passed to lines() are silently ignored (those
# arguments only take effect in plot()); harmless, but redundant there
plot(density(EFA_scores[,1], na.rm = TRUE),
xlim = c(-3, 3), ylim = c(0, 1), col = "blue")
lines(density(CFA_scores[,1], na.rm = TRUE),
xlim = c(-3, 3), ylim = c(0, 1), col = "red")
# Add some plausible item/factor loadings to the syntax
# (cross-loadings suggested by the EFA: N4 also loads on EXT, E3 also on NEU)
theory_syn_add <- "
AGE: A1, A2, A3, A4, A5
CON: C1, C2, C3, C4, C5
EXT: E1, E2, E3, E4, E5, N4
NEU: N1, N2, N3, N4, N5, E3
OPE: O1, O2, O3, O4, O5
"
# Convert your equations to sem-compatible syntax
theory_syn2 <- cfa(text = theory_syn_add, reference.indicators = FALSE)
## NOTE: adding 25 variances to the model
# Run a CFA with the revised syntax
theory_CFA_add <- sem(model = theory_syn2, data = bfi_CFA)
## Warning in sem.semmod(model = theory_syn2, data = bfi_CFA): -289 observations
## removed due to missingness
## Warning in sem.semmod(model = theory_syn2, data = bfi_CFA): The following observed variables are in the input covariance or raw-moment matrix but do not appear in the model:
## gender, education, age
# Conduct a likelihood ratio test
# (valid because the models are nested: theory_CFA_add adds two loadings)
anova(theory_CFA, theory_CFA_add)
## LR Test for Difference Between Models
##
## Model Df Model Chisq Df LR Chisq Pr(>Chisq)
## theory_CFA 265 2005.0
## theory_CFA_add 263 1901.8 2 103.19 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Compare the comparative fit indices - higher is better!
summary(theory_CFA)$CFI
## [1] 0.7863506
summary(theory_CFA_add)$CFI
## [1] 0.7987748
# Compare the RMSEA values - lower is better!
summary(theory_CFA)$RMSEA
## [1] 0.07691165 NA NA 0.90000000
summary(theory_CFA_add)$RMSEA
## [1] 0.07492514 NA NA 0.90000000
# Compare BIC values
# (lower is better; all three criteria favor the model with added loadings)
summary(theory_CFA)$BIC
## [1] 146.5663
summary(theory_CFA_add)$BIC
## [1] 57.40664
# Remove the weakest factor loading from the syntax
# (O4 had the smallest OPE loading in the theory CFA above, so it is dropped)
theory_syn_del <- "
AGE: A1, A2, A3, A4, A5
CON: C1, C2, C3, C4, C5
EXT: E1, E2, E3, E4, E5
NEU: N1, N2, N3, N4, N5
OPE: O1, O2, O3, O5
"
# Convert your equations to sem-compatible syntax
theory_syn3 <- cfa(text = theory_syn_del, reference.indicators = FALSE)
## NOTE: adding 24 variances to the model
# Run a CFA with the revised syntax
theory_CFA_del <- sem(model = theory_syn3, data = bfi_CFA)
## Warning in sem.semmod(model = theory_syn3, data = bfi_CFA): -289 observations
## removed due to missingness
## Warning in sem.semmod(model = theory_syn3, data = bfi_CFA): The following observed variables are in the input covariance or raw-moment matrix but do not appear in the model:
## O4, gender, education, age
# Compare the comparative fit indices - higher is better!
summary(theory_CFA)$CFI
## [1] 0.7863506
summary(theory_CFA_del)$CFI
## [1] 0.7983846
# Compare the RMSEA values - lower is better!
summary(theory_CFA)$RMSEA
## [1] 0.07691165 NA NA 0.90000000
summary(theory_CFA_del)$RMSEA
## [1] 0.07732379 NA NA 0.90000000
# Compare BIC values
# (BIC gets worse after deleting O4, so the deletion is not supported here)
summary(theory_CFA)$BIC
## [1] 146.5663
summary(theory_CFA_del)$BIC
## [1] 150.9206
# Reset to baseline
options(fit.indices = origFit)
Chapter 1 - GLM - Extension of Regression Toolbox
Limitations of linear models:
Poisson regression:
Basic lm() functions with glm():
Example code includes:
# Keep only each chick's final weight measurement
data(ChickWeight, package="datasets")
ChickWeightEnd <- ChickWeight %>%
# relabel chicks with plain integer factor levels
mutate(Chick=as.factor(as.integer(Chick))) %>%
group_by(Chick) %>%
# max(Time) is evaluated per chick because of the group_by above;
# chicks 1, 2, 3, 8, 41 are excluded — presumably they did not complete
# the full study (TODO confirm against the raw data)
filter(Time==max(Time), !(Chick %in% c(1, 2, 3, 8, 41))) %>%
ungroup()
glimpse(ChickWeightEnd)
## Observations: 45
## Variables: 4
## $ weight <dbl> 205, 215, 202, 157, 223, 157, 305, 98, 124, 175, 205, 96, 26...
## $ Time <dbl> 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, ...
## $ Chick <fct> 15, 17, 14, 11, 18, 12, 20, 5, 7, 13, 16, 4, 19, 9, 10, 6, 3...
## $ Diet <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, ...
# Fit a lm()
# (Diet is a factor; Diet1 is the reference level, Diet2-4 are offsets from it)
lm(formula = weight ~ Diet, data = ChickWeightEnd)
##
## Call:
## lm(formula = weight ~ Diet, data = ChickWeightEnd)
##
## Coefficients:
## (Intercept) Diet2 Diet3 Diet4
## 177.75 36.95 92.55 60.81
# Fit a glm()
# gaussian family with the default identity link reproduces lm()'s coefficients
glm( formula = weight ~ Diet , data = ChickWeightEnd, family = 'gaussian')
##
## Call: glm(formula = weight ~ Diet, family = "gaussian", data = ChickWeightEnd)
##
## Coefficients:
## (Intercept) Diet2 Diet3 Diet4
## 177.75 36.95 92.55 60.81
##
## Degrees of Freedom: 44 Total (i.e. Null); 41 Residual
## Null Deviance: 225000
## Residual Deviance: 167800 AIC: 507.8
# Small example dataset: a count observed at each of 30 time points
dat <- data.frame(time=1:30,
count=c(0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 2, 0, 1, 0, 0, 1, 0, 0, 0, 2, 2, 1, 1, 4, 1, 1, 1, 1, 0, 0)
)
dat
## time count
## 1 1 0
## 2 2 0
## 3 3 0
## 4 4 0
## 5 5 1
## 6 6 0
## 7 7 0
## 8 8 1
## 9 9 0
## 10 10 0
## 11 11 2
## 12 12 0
## 13 13 1
## 14 14 0
## 15 15 0
## 16 16 1
## 17 17 0
## 18 18 0
## 19 19 0
## 20 20 2
## 21 21 2
## 22 22 1
## 23 23 1
## 24 24 4
## 25 25 1
## 26 26 1
## 27 27 1
## 28 28 1
## 29 29 0
## 30 30 0
# fit y predicted by x with data.frame dat using the poisson family
# (the default log link means coefficients are on the log-count scale)
poissonOut <- glm(count ~ time, data=dat, family="poisson")
# print the output
print(poissonOut)
##
## Call: glm(formula = count ~ time, family = "poisson", data = dat)
##
## Coefficients:
## (Intercept) time
## -1.43036 0.05815
##
## Degrees of Freedom: 29 Total (i.e. Null); 28 Residual
## Null Deviance: 35.63
## Residual Deviance: 30.92 AIC: 66.02
# Fit a glm with count predicted by time using data.frame dat and gaussian family
# (equivalent to an ordinary linear regression; fitted for comparison below)
lmOut <- glm(count ~ time, data=dat, family="gaussian")
summary(lmOut)
##
## Call:
## glm(formula = count ~ time, family = "gaussian", data = dat)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.2022 -0.5190 -0.1497 0.2595 3.0194
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.09425 0.32891 0.287 0.7766
## time 0.03693 0.01853 1.993 0.0561 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.7714815)
##
## Null deviance: 24.667 on 29 degrees of freedom
## Residual deviance: 21.601 on 28 degrees of freedom
## AIC: 81.283
##
## Number of Fisher Scoring iterations: 2
# Compare: the Poisson model has the lower AIC (66.02 vs 81.28 for gaussian)
summary(poissonOut)
##
## Call:
## glm(formula = count ~ time, family = "poisson", data = dat)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6547 -0.9666 -0.7226 0.3830 2.3022
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.43036 0.59004 -2.424 0.0153 *
## time 0.05815 0.02779 2.093 0.0364 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 35.627 on 29 degrees of freedom
## Residual deviance: 30.918 on 28 degrees of freedom
## AIC: 66.024
##
## Number of Fisher Scoring iterations: 5
# Toy dataset: goals scored by two players over five games each
scores <- data.frame(player=rep(c("Sam", "Lou"), each=5),
goal=c(1, 2, 0, 4, 3, 0, 0, 1, 0, 0)
)
scores
## player goal
## 1 Sam 1
## 2 Sam 2
## 3 Sam 0
## 4 Sam 4
## 5 Sam 3
## 6 Lou 0
## 7 Lou 0
## 8 Lou 1
## 9 Lou 0
## 10 Lou 0
# Fit a glm() that estimates the difference between players
# (playerSam is the log rate ratio of Sam relative to the reference, Lou)
summary(glm(goal ~ player, data=scores, family="poisson"))
##
## Call:
## glm(formula = goal ~ player, family = "poisson", data = scores)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0000 -0.6325 -0.6325 0.4934 1.2724
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.6094 0.9999 -1.610 0.1075
## playerSam 2.3026 1.0487 2.196 0.0281 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 18.3578 on 9 degrees of freedom
## Residual deviance: 9.8105 on 8 degrees of freedom
## AIC: 26.682
##
## Number of Fisher Scoring iterations: 5
# Fit a glm() that estimates an intercept for each player
# (the "- 1" removes the global intercept, so each coefficient is that
# player's own log goal rate rather than a contrast against a reference)
summary(glm(goal ~ player - 1, data=scores, family="poisson"))
##
## Call:
## glm(formula = goal ~ player - 1, family = "poisson", data = scores)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0000 -0.6325 -0.6325 0.4934 1.2724
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## playerLou -1.6094 0.9999 -1.610 0.1075
## playerSam 0.6931 0.3162 2.192 0.0284 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 18.4546 on 10 degrees of freedom
## Residual deviance: 9.8105 on 8 degrees of freedom
## AIC: 26.682
##
## Number of Fisher Scoring iterations: 5
# Daily earthquake counts over 4368 days starting 2005-01-10
dat2 <- data.frame(Date=as.Date("2005-01-09")+1:4368, Number=0L) %>%
mutate(Month=as.factor(lubridate::month(Date)))
# Row indices of days with exactly 1, 2, ... earthquakes
# (no eq8 vector: no day in this span recorded exactly 8 earthquakes)
eq1 <- c(1, 2, 6, 22, 42, 47, 48, 86, 96, 109, 113, 119, 190, 192, 208, 248, 264, 278, 306, 333, 334, 336, 368, 375, 392, 393, 408, 417, 424, 429, 439, 449, 455, 456, 500, 523, 536, 544, 545, 548, 550, 551, 586, 590, 597, 598, 673, 678, 700, 717, 740, 750, 755, 756, 767, 775, 793, 831, 859, 865, 866, 877, 885, 887, 895, 937, 1086, 1101, 1107, 1111, 1112, 1154, 1157, 1183, 1213, 1235, 1247, 1251, 1269, 1272, 1288, 1295, 1300, 1320, 1342, 1350, 1424, 1454, 1457, 1460, 1476, 1522, 1589, 1598, 1608, 1627, 1642, 1665, 1697, 1709, 1733, 1746, 1749, 1766, 1799, 1830, 1866, 1895, 1914, 1920, 1934, 1942, 1953, 1960, 1961, 1966, 1969, 1989, 2007, 2041, 2051, 2087, 2092, 2096, 2106, 2122, 2129, 2138, 2156, 2159, 2174, 2176, 2177, 2180, 2191, 2214, 2217, 2218, 2251, 2276, 2286, 2302, 2308, 2340, 2352, 2361, 2382, 2416, 2419, 2421, 2464, 2468, 2492, 2522, 2526, 2548, 2550, 2573, 2620, 2625, 2627, 2629, 2698, 2706, 2721, 2726, 2760, 2768, 2787, 2796, 2813, 2854, 2858, 2890, 2900, 2909, 2932, 2933, 2955, 2960, 2966, 2997, 3032, 3057, 3063, 3080, 3090, 3095, 3098, 3122, 3130, 3154, 3160, 3199, 3205, 3215, 3227, 3229, 3243, 3244, 3254, 3302, 3340, 3350, 3469, 3506, 3519, 3525, 3535, 3542, 3584, 3604, 3653, 3660, 3673, 3692, 3694, 3706, 3763, 3792, 3801, 3808, 3812, 3814, 3822, 3884, 3892, 4001, 4084, 4194, 4210, 4220, 4229, 4242, 4265, 4267, 4296, 4302, 4325, 4334, 4338, 4341, 4353, 4354, 4357, 4368)
eq2 <- c(21, 195, 308, 505, 522, 560, 913, 1202, 1353, 1439, 1473, 1484, 1614, 1717, 1808, 1940, 2110, 2391, 2407, 2535, 2716, 2748, 2949, 3313, 3421, 3671, 3967, 3991, 4281)
eq3 <- c(624, 776, 1364, 1585, 2063, 2109, 2196, 2569, 2576, 2607, 3399, 3533, 3607)
eq4 <- c(463, 1918, 2417, 3064, 3606)
eq5 <- c(13, 3826)
eq6 <- c(701, 2097)
eq7 <- c(2509, 4276)
eq9 <- c(1637)
# Write each count into the Number column by row index (integer literals
# keep the column type as integer)
dat2[eq1, "Number"] <- 1L
dat2[eq2, "Number"] <- 2L
dat2[eq3, "Number"] <- 3L
dat2[eq4, "Number"] <- 4L
dat2[eq5, "Number"] <- 5L
dat2[eq6, "Number"] <- 6L
dat2[eq7, "Number"] <- 7L
dat2[eq9, "Number"] <- 9L
# Inspect the assembled data frame
str(dat2)
## 'data.frame': 4368 obs. of 3 variables:
## $ Date : Date, format: "2005-01-10" "2005-01-11" ...
## $ Number: int 1 1 0 0 0 1 0 0 0 0 ...
## $ Month : Factor w/ 12 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
# Distribution of daily counts (heavily zero-dominated)
table(dat2$Number)
##
## 0 1 2 3 4 5 6 7 9
## 4068 246 29 13 5 2 2 2 1
# Number of days falling in each calendar month
table(dat2$Month)
##
## 1 2 3 4 5 6 7 8 9 10 11 12
## 363 339 372 360 372 360 372 372 360 372 360 366
# build your models
# NOTE(review): these reuse (overwrite) the lmOut/poissonOut names from the
# earlier count ~ time example
lmOut <- lm(Number ~ Month, data=dat2)
poissonOut <- glm(Number ~ Month, data=dat2, family="poisson")
# examine the outputs using print
print(lmOut)
##
## Call:
## lm(formula = Number ~ Month, data = dat2)
##
## Coefficients:
## (Intercept) Month2 Month3 Month4 Month5 Month6
## 0.129477 -0.038031 -0.078401 -0.057254 -0.032702 -0.043365
## Month7 Month8 Month9 Month10 Month11 Month12
## -0.005821 -0.051520 -0.023921 -0.054208 -0.023921 -0.022919
# print method for glm also shows deviances and AIC
print(poissonOut)
##
## Call: glm(formula = Number ~ Month, family = "poisson", data = dat2)
##
## Coefficients:
## (Intercept) Month2 Month3 Month4 Month5 Month6
## -2.0443 -0.3478 -0.9302 -0.5838 -0.2911 -0.4079
## Month7 Month8 Month9 Month10 Month11 Month12
## -0.0460 -0.5073 -0.2043 -0.5424 -0.2043 -0.1948
##
## Degrees of Freedom: 4367 Total (i.e. Null); 4356 Residual
## Null Deviance: 2325
## Residual Deviance: 2303 AIC: 2976
# examine the outputs using summary
summary(lmOut)
##
## Call:
## lm(formula = Number ~ Month, data = dat2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.1295 -0.1056 -0.0914 -0.0753 8.8763
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.129477 0.022770 5.686 1.38e-08 ***
## Month2 -0.038031 0.032767 -1.161 0.2458
## Month3 -0.078401 0.032007 -2.450 0.0143 *
## Month4 -0.057254 0.032269 -1.774 0.0761 .
## Month5 -0.032702 0.032007 -1.022 0.3070
## Month6 -0.043365 0.032269 -1.344 0.1791
## Month7 -0.005821 0.032007 -0.182 0.8557
## Month8 -0.051520 0.032007 -1.610 0.1075
## Month9 -0.023921 0.032269 -0.741 0.4586
## Month10 -0.054208 0.032007 -1.694 0.0904 .
## Month11 -0.023921 0.032269 -0.741 0.4586
## Month12 -0.022919 0.032136 -0.713 0.4758
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4338 on 4356 degrees of freedom
## Multiple R-squared: 0.00249, Adjusted R-squared: -2.927e-05
## F-statistic: 0.9884 on 11 and 4356 DF, p-value: 0.4542
# Poisson summary: z-tests per coefficient, dispersion fixed at 1
summary(poissonOut)
##
## Call:
## glm(formula = Number ~ Month, family = "poisson", data = dat2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.5089 -0.4595 -0.4277 -0.3880 7.7086
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.0443 0.1459 -14.015 < 2e-16 ***
## Month2 -0.3478 0.2314 -1.503 0.132839
## Month3 -0.9302 0.2719 -3.422 0.000623 ***
## Month4 -0.5837 0.2444 -2.388 0.016923 *
## Month5 -0.2911 0.2215 -1.314 0.188706
## Month6 -0.4079 0.2314 -1.763 0.077939 .
## Month7 -0.0460 0.2074 -0.222 0.824486
## Month8 -0.5073 0.2361 -2.149 0.031671 *
## Month9 -0.2043 0.2182 -0.936 0.349112
## Month10 -0.5424 0.2387 -2.272 0.023075 *
## Month11 -0.2043 0.2182 -0.936 0.349112
## Month12 -0.1948 0.2166 -0.899 0.368434
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 2325.3 on 4367 degrees of freedom
## Residual deviance: 2302.7 on 4356 degrees of freedom
## AIC: 2975.6
##
## Number of Fisher Scoring iterations: 6
# examine the outputs using tidy
# broom::tidy() returns the coefficient table as a tibble (handy for plotting)
broom::tidy(lmOut)
## # A tibble: 12 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 0.129 0.0228 5.69 0.0000000138
## 2 Month2 -0.0380 0.0328 -1.16 0.246
## 3 Month3 -0.0784 0.0320 -2.45 0.0143
## 4 Month4 -0.0573 0.0323 -1.77 0.0761
## 5 Month5 -0.0327 0.0320 -1.02 0.307
## 6 Month6 -0.0434 0.0323 -1.34 0.179
## 7 Month7 -0.00582 0.0320 -0.182 0.856
## 8 Month8 -0.0515 0.0320 -1.61 0.108
## 9 Month9 -0.0239 0.0323 -0.741 0.459
## 10 Month10 -0.0542 0.0320 -1.69 0.0904
## 11 Month11 -0.0239 0.0323 -0.741 0.459
## 12 Month12 -0.0229 0.0321 -0.713 0.476
# same tidy coefficient table for the Poisson model
broom::tidy(poissonOut)
## # A tibble: 12 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -2.04 0.146 -14.0 1.27e-44
## 2 Month2 -0.348 0.231 -1.50 1.33e- 1
## 3 Month3 -0.930 0.272 -3.42 6.23e- 4
## 4 Month4 -0.584 0.244 -2.39 1.69e- 2
## 5 Month5 -0.291 0.221 -1.31 1.89e- 1
## 6 Month6 -0.408 0.231 -1.76 7.79e- 2
## 7 Month7 -0.0460 0.207 -0.222 8.24e- 1
## 8 Month8 -0.507 0.236 -2.15 3.17e- 2
## 9 Month9 -0.204 0.218 -0.936 3.49e- 1
## 10 Month10 -0.542 0.239 -2.27 2.31e- 2
## 11 Month11 -0.204 0.218 -0.936 3.49e- 1
## 12 Month12 -0.195 0.217 -0.899 3.68e- 1
# Extract the regression coefficients
# (on the log scale because of the Poisson log link; exp() to get rate ratios)
coef(poissonOut)
## (Intercept) Month2 Month3 Month4 Month5 Month6
## -2.04425523 -0.34775767 -0.93019964 -0.58375226 -0.29111968 -0.40786159
## Month7 Month8 Month9 Month10 Month11 Month12
## -0.04599723 -0.50734279 -0.20426264 -0.54243411 -0.20426264 -0.19481645
# Extract the confidence intervals
# confint() on a glm uses profile likelihood, hence the "Waiting for
# profiling to be done..." message (unlike the Wald intervals of confint.lm)
confint(poissonOut)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) -2.3444432 -1.77136313
## Month2 -0.8103027 0.10063404
## Month3 -1.4866061 -0.41424128
## Month4 -1.0762364 -0.11342457
## Month5 -0.7311289 0.14051326
## Month6 -0.8704066 0.04053012
## Month7 -0.4542037 0.36161360
## Month8 -0.9807831 -0.05092540
## Month9 -0.6367321 0.22171492
## Month10 -1.0218277 -0.08165226
## Month11 -0.6367321 0.22171492
## Month12 -0.6237730 0.22851779
# use the model to predict with new data
# NOTE(review): newDat's Month factor only has levels "6","7","8"; predict()
# matches factor levels by label, so this still lines up with the fitted model
newDat <- data.frame(Month=as.factor(6:8))
# type = "response" applies the inverse link: predictions are expected
# counts per day, not log-counts
predOut <- predict(object = poissonOut, newdata = newDat, type = "response")
# print the predictions
print(predOut)
## 1 2 3
## 0.08611111 0.12365591 0.07795699
Chapter 2 - Logistic Regression
Overview of logistic regression:
Bernoulli vs. Binomial Distribution:
Link functions - probit compared with logit:
Example code includes:
# Commuter survey data: days commuted, one-way miles, and bus use (Yes/No)
busData <- readr::read_csv("./RInputFiles/busData.csv")
## Parsed with column specification:
## cols(
## CommuteDays = col_double(),
## MilesOneWay = col_double(),
## Bus = col_character()
## )
# Make Bus a factor with "No" first, so the glm below models P(Bus == "Yes")
bus <- busData %>%
mutate(Bus=factor(Bus, levels=c("No", "Yes")))
glimpse(bus)
## Observations: 15,892
## Variables: 3
## $ CommuteDays <dbl> 5, 5, 5, 5, 3, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 5...
## $ MilesOneWay <dbl> 19.54675, 19.54675, 19.54675, 19.54675, 19.54675, 21.66...
## $ Bus <fct> Yes, Yes, Yes, Yes, Yes, Yes, No, No, No, No, No, No, N...
# Build a glm that models Bus predicted by CommuteDays
# using data.frame bus. Remember to use a binomial family.
busOut <- glm(Bus ~ CommuteDays, data=bus, family="binomial")
# Print the busOut (be sure to use the print() function)
print(busOut)
##
## Call: glm(formula = Bus ~ CommuteDays, family = "binomial", data = bus)
##
## Coefficients:
## (Intercept) CommuteDays
## -1.4549 0.1299
##
## Degrees of Freedom: 15891 Total (i.e. Null); 15890 Residual
## Null Deviance: 19570
## Residual Deviance: 19540 AIC: 19540
# Look at the summary() of busOut
summary(busOut)
##
## Call:
## glm(formula = Bus ~ CommuteDays, family = "binomial", data = bus)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.9560 -0.8595 -0.8595 1.5330 1.7668
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.45493 0.11471 -12.683 < 2e-16 ***
## CommuteDays 0.12985 0.02312 5.616 1.96e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 19568 on 15891 degrees of freedom
## Residual deviance: 19536 on 15890 degrees of freedom
## AIC: 19540
##
## Number of Fisher Scoring iterations: 4
# Look at the tidy() output of busOut
broom::tidy(busOut)
## # A tibble: 2 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -1.45 0.115 -12.7 7.32e-37
## 2 CommuteDays 0.130 0.0231 5.62 1.96e- 8
# Simulate 1 draw with a sample size of 100 (binomial: one count out of 100 trials)
binomialSim <- rbinom(n=1, size=100, prob=0.5)
# Simulate 100 draws with a sample size of 1 (Bernoulli: 100 individual 0/1 trials)
BernoulliSim <- rbinom(n=100, size=1, prob=0.5)
# Print the results from the binomial
print(binomialSim)
## [1] 47
# Sum the results from the Bernoulli (comparable to the single binomial count)
sum(BernoulliSim)
## [1] 46
dataLong <- data.frame(x=factor(rep(c("a", "b"), each=14), levels=c("a", "b")),
y=factor(c('fail', 'fail', 'fail', 'fail', 'success', 'fail', 'fail', 'fail', 'fail', 'fail', 'fail', 'fail', 'fail', 'success', 'success', 'fail', 'success', 'success', 'success', 'success', 'success', 'success', 'success', 'success', 'success', 'fail', 'success', 'fail'), levels=c("fail", "success"))
)
str(dataLong)
## 'data.frame': 28 obs. of 2 variables:
## $ x: Factor w/ 2 levels "a","b": 1 1 1 1 1 1 1 1 1 1 ...
## $ y: Factor w/ 2 levels "fail","success": 1 1 1 1 2 1 1 1 1 1 ...
# Fit a long-format logistic regression (one row per observation)
lr_1 <- glm(y ~ x, data=dataLong, family="binomial")
print(lr_1)
##
## Call: glm(formula = y ~ x, family = "binomial", data = dataLong)
##
## Coefficients:
## (Intercept) xb
## -1.792 3.091
##
## Degrees of Freedom: 27 Total (i.e. Null); 26 Residual
## Null Deviance: 38.67
## Residual Deviance: 26.03 AIC: 30.03
dataWide <- dataLong %>%
group_by(x) %>%
summarize(fail=sum(y=="fail"), success=sum(y=="success"), Total=n(), successProportion = success/Total)
dataWide
## # A tibble: 2 x 5
## x fail success Total successProportion
## <fct> <int> <int> <int> <dbl>
## 1 a 12 2 14 0.143
## 2 b 3 11 14 0.786
# Fit a wide-form logistic regression.
# NOTE: cbind(fail, success) puts "fail" in the first column, so "fail" is
# the modeled event here -- the coefficient signs in the lr_2 output below
# are flipped relative to lr_1/lr_3 (which model "success").
# cbind(success, fail) would reproduce lr_1's coefficients exactly.
lr_2 <- glm(cbind(fail, success) ~ x, data=dataWide, family="binomial")
# Fit a weighted-form logistic regression (group proportions weighted by group size)
lr_3 <- glm(successProportion ~ x, weights=Total, data=dataWide, family="binomial")
# print your results
print(lr_2)
##
## Call: glm(formula = cbind(fail, success) ~ x, family = "binomial",
## data = dataWide)
##
## Coefficients:
## (Intercept) xb
## 1.792 -3.091
##
## Degrees of Freedom: 1 Total (i.e. Null); 0 Residual
## Null Deviance: 12.64
## Residual Deviance: -4.441e-16 AIC: 9.215
print(lr_3)
##
## Call: glm(formula = successProportion ~ x, family = "binomial", data = dataWide,
## weights = Total)
##
## Coefficients:
## (Intercept) xb
## -1.792 3.091
##
## Degrees of Freedom: 1 Total (i.e. Null); 0 Residual
## Null Deviance: 12.64
## Residual Deviance: 4.441e-15 AIC: 9.215
# Fit a GLM with a logit link and save it as busLogit
busLogit <- glm(Bus ~ CommuteDays, data = bus, family = binomial(link = "logit"))
# Fit a GLM with probit link and save it as busProbit
busProbit <- glm(Bus ~ CommuteDays, data = bus, family = binomial(link = "probit"))
# Print model summaries
summary(busLogit)
##
## Call:
## glm(formula = Bus ~ CommuteDays, family = binomial(link = "logit"),
## data = bus)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.9560 -0.8595 -0.8595 1.5330 1.7668
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.45493 0.11471 -12.683 < 2e-16 ***
## CommuteDays 0.12985 0.02312 5.616 1.96e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 19568 on 15891 degrees of freedom
## Residual deviance: 19536 on 15890 degrees of freedom
## AIC: 19540
##
## Number of Fisher Scoring iterations: 4
summary(busProbit)
##
## Call:
## glm(formula = Bus ~ CommuteDays, family = binomial(link = "probit"),
## data = bus)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.9545 -0.8596 -0.8596 1.5328 1.7706
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.88951 0.06833 -13.017 < 2e-16 ***
## CommuteDays 0.07810 0.01380 5.658 1.53e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 19568 on 15891 degrees of freedom
## Residual deviance: 19536 on 15890 degrees of freedom
## AIC: 19540
##
## Number of Fisher Scoring iterations: 4
# Convert from the logit scale to a probability.
# FIX: the original used dlogis(0), which is the logistic *density* at 0
# (0.25), not a probability. The logit-to-probability conversion is the
# logistic CDF (inverse logit), plogis(); plogis(0) = 0.5, which parallels
# pnorm(0) = 0.5 for the probit scale below.
p <- plogis(0)
# Simulate a logit
rbinom(n=10, size=1, prob=p)
## [1] 0 0 1 0 0 0 0 0 0 0
# (captured output above is random and was generated with the old p = 0.25)
# Convert from the probit scale to a probability (standard normal CDF)
p <- pnorm(0)
# Simulate a probit
rbinom(n=10, size=1, prob=p)
## [1] 0 0 0 1 0 0 1 0 1 1
Chapter 3 - Interpreting and Visualizing GLMs
Poisson Regression Coefficients:
Plotting Poisson Regression:
Understanding output from logistic regression:
ggplot2 and binomial regression:
Example code includes:
# extract the coefficients from lmOut
(lmCoef <- coef(lmOut))
## (Intercept) Month2 Month3 Month4 Month5 Month6
## 0.12947658 -0.03803116 -0.07840132 -0.05725436 -0.03270239 -0.04336547
## Month7 Month8 Month9 Month10 Month11 Month12
## -0.00582067 -0.05151959 -0.02392103 -0.05420777 -0.02392103 -0.02291921
# extract the coefficients from poissonOut
(poissonCoef <- coef(poissonOut))
## (Intercept) Month2 Month3 Month4 Month5 Month6
## -2.04425523 -0.34775767 -0.93019964 -0.58375226 -0.29111968 -0.40786159
## Month7 Month8 Month9 Month10 Month11 Month12
## -0.04599723 -0.50734279 -0.20426264 -0.54243411 -0.20426264 -0.19481645
# take the exponential using exp() -- back-transforms from the log link scale
(poissonCoefExp <- exp(poissonCoef))
## (Intercept) Month2 Month3 Month4 Month5 Month6
## 0.1294766 0.7062700 0.3944749 0.5578014 0.7474262 0.6650709
## Month7 Month8 Month9 Month10 Month11 Month12
## 0.9550446 0.6020933 0.8152482 0.5813315 0.8152482 0.8229857
# This is because the Poisson coefficients are multiplicative on the response scale
# Notice that 0.129 * 0.706 = 0.091 from the Poisson coefficients is the same as 0.129-0.038 = 0.091 from the linear model
cellData <- data.frame(dose=c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10),
cells=c(1, 0, 0, 0, 0, 2, 0, 1, 2, 0, 3, 0, 2, 2, 1, 0, 1, 2, 2, 2, 2, 3, 5, 3, 0, 3, 6, 2, 4, 4, 2, 2, 8, 4, 4, 4, 7, 2, 6, 5, 2, 5, 8, 4, 7, 4, 4, 7, 9, 3, 6, 7, 9, 5, 3, 5, 5, 3, 4, 11, 2, 7, 9, 3, 4, 2, 6, 5, 5, 6, 4, 5, 8, 10, 11, 9, 8, 8, 11, 7, 10, 12, 9, 12, 10, 12, 9, 17, 6, 9, 15, 11, 11, 10, 4, 9, 13, 8, 8, 13)
)
# Use geom_smooth to plot a continuous predictor variable
ggplot(data = cellData, aes(x = dose, y = cells)) +
geom_jitter(width = 0.05, height = 0.05) +
geom_smooth(method = 'glm', method.args = list(family = 'poisson'))
# Extract out the coefficients
coefOut <- coef(busOut)
# Convert the coefficients to odds-ratios
exp(coefOut)
## (Intercept) CommuteDays
## 0.2334164 1.1386623
# use tidy on busOut and exponentiate the results and extract the confidence interval
broom::tidy(busOut, exponentiate=TRUE, conf.int=TRUE)
## # A tibble: 2 x 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 0.233 0.115 -12.7 7.32e-37 0.186 0.292
## 2 CommuteDays 1.14 0.0231 5.62 1.96e- 8 1.09 1.19
str(bus)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 15892 obs. of 3 variables:
## $ CommuteDays: num 5 5 5 5 3 4 5 5 5 5 ...
## $ MilesOneWay: num 19.5 19.5 19.5 19.5 19.5 ...
## $ Bus : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 1 1 1 1 ...
bus <- bus %>%
mutate(Bus2 = as.integer(Bus)-1)
str(bus)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 15892 obs. of 4 variables:
## $ CommuteDays: num 5 5 5 5 3 4 5 5 5 5 ...
## $ MilesOneWay: num 19.5 19.5 19.5 19.5 19.5 ...
## $ Bus : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 1 1 1 1 ...
## $ Bus2 : num 1 1 1 1 1 1 0 0 0 0 ...
# add in the missing parts of the ggplot
ggJitter <- ggplot(data = bus, aes(x = MilesOneWay, y = Bus2)) +
geom_jitter(width = 0, height = 0.05)
# add in geom_smooth()
ggJitter + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# add in the missing parts of the ggplot
ggJitter + geom_smooth(method = "glm" , method.args = list(family="binomial"))
# add in the missing parts of the ggplot
ggJitter +
geom_smooth(method = 'glm', method.args = list(family = binomial(link="probit")),
color = 'red', se = FALSE
) +
geom_smooth(method = 'glm', method.args = list(family = binomial(link="logit")),
color = 'blue', se = FALSE
)
Chapter 4 - Multiple Regression with GLMs
Multiple logistic regression:
Formulas in R:
Assumptions of multiple logistic regression:
Wrap up:
Example code includes:
# Build a logistic regression with Bus predicted by CommuteDays and MilesOneWay
busBoth <- glm(Bus ~ CommuteDays + MilesOneWay, data=bus, family="binomial")
# Look at the summary of the output
summary(busBoth)
##
## Call:
## glm(formula = Bus ~ CommuteDays + MilesOneWay, family = "binomial",
## data = bus)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.0732 -0.9035 -0.7816 1.3968 2.5066
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.707515 0.119719 -5.910 3.42e-09 ***
## CommuteDays 0.066084 0.023181 2.851 0.00436 **
## MilesOneWay -0.059571 0.003218 -18.512 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 19568 on 15891 degrees of freedom
## Residual deviance: 19137 on 15889 degrees of freedom
## AIC: 19143
##
## Number of Fisher Scoring iterations: 4
# Build a logistic regression with Bus predicted by CommuteDays
busDays <- glm(Bus ~ CommuteDays, data=bus, family="binomial")
# Build a logistic regression with Bus predicted by MilesOneWay
busMiles <- glm(Bus ~ MilesOneWay, data=bus, family="binomial")
# Build a glm with CommuteDays first and MilesOneWay second
busOne <- glm(Bus ~ CommuteDays + MilesOneWay, data=bus, family="binomial")
# Build a glm with MilesOneWay first and CommuteDays second
busTwo <- glm(Bus ~ MilesOneWay + CommuteDays, data=bus, family="binomial")
# Print model summaries
summary(busOne)
##
## Call:
## glm(formula = Bus ~ CommuteDays + MilesOneWay, family = "binomial",
## data = bus)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.0732 -0.9035 -0.7816 1.3968 2.5066
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.707515 0.119719 -5.910 3.42e-09 ***
## CommuteDays 0.066084 0.023181 2.851 0.00436 **
## MilesOneWay -0.059571 0.003218 -18.512 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 19568 on 15891 degrees of freedom
## Residual deviance: 19137 on 15889 degrees of freedom
## AIC: 19143
##
## Number of Fisher Scoring iterations: 4
summary(busTwo)
##
## Call:
## glm(formula = Bus ~ MilesOneWay + CommuteDays, family = "binomial",
## data = bus)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.0732 -0.9035 -0.7816 1.3968 2.5066
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.707515 0.119719 -5.910 3.42e-09 ***
## MilesOneWay -0.059571 0.003218 -18.512 < 2e-16 ***
## CommuteDays 0.066084 0.023181 2.851 0.00436 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 19568 on 15891 degrees of freedom
## Residual deviance: 19137 on 15889 degrees of freedom
## AIC: 19143
##
## Number of Fisher Scoring iterations: 4
size <- c(1.1, 2.2, 3.3)
count <- c(10, 4, 2)
# use model matrix with size (intercept column plus one numeric column)
model.matrix(~ size)
## (Intercept) size
## 1 1 1.1
## 2 1 2.2
## 3 1 3.3
## attr(,"assign")
## [1] 0 1
# use model matrix with count
model.matrix(~ size + count)
## (Intercept) size count
## 1 1 1.1 10
## 2 1 2.2 4
## 3 1 3.3 2
## attr(,"assign")
## [1] 0 1 2
color <- c("red", "blue", "green")
# create a matrix that includes a reference intercept
model.matrix(~ color)
## (Intercept) colorgreen colorred
## 1 1 0 1
## 2 1 0 0
## 3 1 1 0
## attr(,"assign")
## [1] 0 1 1
## attr(,"contrasts")
## attr(,"contrasts")$color
## [1] "contr.treatment"
# create a matrix that includes an intercept for each group
model.matrix(~ color - 1)
## colorblue colorgreen colorred
## 1 0 0 1
## 2 1 0 0
## 3 0 1 0
## attr(,"assign")
## [1] 1 1 1
## attr(,"contrasts")
## attr(,"contrasts")$color
## [1] "contr.treatment"
shape <- c("square", "square", "circle")
# create a matrix that includes color and shape
model.matrix(~ color + shape - 1)
## colorblue colorgreen colorred shapesquare
## 1 0 0 1 1
## 2 1 0 0 1
## 3 0 1 0 0
## attr(,"assign")
## [1] 1 1 1 2
## attr(,"contrasts")
## attr(,"contrasts")$color
## [1] "contr.treatment"
##
## attr(,"contrasts")$shape
## [1] "contr.treatment"
# create a matrix that includes shape and color
model.matrix(~ shape + color - 1)
## shapecircle shapesquare colorgreen colorred
## 1 0 1 0 1
## 2 0 1 0 0
## 3 1 0 1 0
## attr(,"assign")
## [1] 1 1 2 2
## attr(,"contrasts")
## attr(,"contrasts")$shape
## [1] "contr.treatment"
##
## attr(,"contrasts")$color
## [1] "contr.treatment"
data("UCBAdmissions", package="datasets")
UCBdata <- as.data.frame(UCBAdmissions) %>%
mutate(Gender=factor(Gender, levels=c("Female", "Male")), Dept=factor(Dept, levels=LETTERS[1:6])) %>%
tidyr::spread(Admit, Freq) %>%
arrange(Dept, Gender)
# build a binomial glm where Admitted and Rejected are predicted by Gender
glm1 <- glm(cbind(Admitted, Rejected) ~ Gender, data=UCBdata, family="binomial")
summary(glm1)
##
## Call:
## glm(formula = cbind(Admitted, Rejected) ~ Gender, family = "binomial",
## data = UCBdata)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -16.7915 -4.7613 -0.4365 5.1025 11.2022
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.83049 0.05077 -16.357 <2e-16 ***
## GenderMale 0.61035 0.06389 9.553 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 877.06 on 11 degrees of freedom
## Residual deviance: 783.61 on 10 degrees of freedom
## AIC: 856.55
##
## Number of Fisher Scoring iterations: 4
# build a binomial glm where Admitted and Rejected are predicted by Gender and Dept
glm2 <- glm(cbind(Admitted, Rejected) ~ Gender + Dept, data=UCBdata, family="binomial")
summary(glm2)
##
## Call:
## glm(formula = cbind(Admitted, Rejected) ~ Gender + Dept, family = "binomial",
## data = UCBdata)
##
## Deviance Residuals:
## 1 2 3 4 5 6 7 8
## 3.7189 -1.2487 0.2706 -0.0560 -0.9243 1.2533 -0.0858 0.0826
## 9 10 11 12
## -0.8509 1.2205 0.2052 -0.2076
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.68192 0.09911 6.880 5.97e-12 ***
## GenderMale -0.09987 0.08085 -1.235 0.217
## DeptB -0.04340 0.10984 -0.395 0.693
## DeptC -1.26260 0.10663 -11.841 < 2e-16 ***
## DeptD -1.29461 0.10582 -12.234 < 2e-16 ***
## DeptE -1.73931 0.12611 -13.792 < 2e-16 ***
## DeptF -3.30648 0.16998 -19.452 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 877.056 on 11 degrees of freedom
## Residual deviance: 20.204 on 5 degrees of freedom
## AIC: 103.14
##
## Number of Fisher Scoring iterations: 4
# Add a non-linear equation to a geom_smooth
ggJitter +
geom_smooth(method = 'glm', method.args = list(family = 'binomial'), formula = y~I(x^2), color = 'red')
Chapter 1 - What is Bioconductor?
Introduction to the Bioconductor Project:
Role of S4 in Bioconductor:
Biology of Genomic Datasets:
Example code includes:
# Load the BiocInstaller package
# NOTE(review): BiocInstaller was superseded by BiocManager in
# Bioconductor >= 3.8 -- confirm which Bioconductor release these notes target.
library(BiocInstaller)
# Explicit syntax to check the Bioconductor version
BiocInstaller::biocVersion()
# When BiocInstaller is loaded use biocVersion alone
biocVersion()
# Load the BSgenome package
library(BSgenome)
# Check the version of the BSgenome package
packageVersion("BSgenome")
# Investigate the a_genome object using show()
# show(a_genome)
# Investigate some other accessors
# organism(a_genome)
# provider(a_genome)
# seqinfo(a_genome)
# Load the yeast genome
library(BSgenome.Scerevisiae.UCSC.sacCer3)
# Assign data to the yeastGenome object
yeastGenome <- BSgenome.Scerevisiae.UCSC.sacCer3
# Get the head of seqnames and tail of seqlengths for yeastGenome
head(seqnames(yeastGenome))
tail(seqlengths(yeastGenome))
# Select chromosome M, alias chrM
yeastGenome$chrM
# Count characters of the chrM sequence
nchar(yeastGenome$chrM)
# Assign data to the yeastGenome object
yeastGenome <- BSgenome.Scerevisiae.UCSC.sacCer3
# Get the first 30 bases of each chromosome
getSeq(yeastGenome, start=1, end=30)
Chapter 2 - Biostrings and When to Use Them
Introduction to Biostrings:
Sequence handling:
Why we are interested in patterns:
Example code includes:
# Load packages
library(Biostrings)
# Check the alphabet of the zikaVirus
alphabet(zikaVirus)
# Check the alphabetFrequency of the zikaVirus
alphabetFrequency(zikaVirus)
# Check alphabet of the zikaVirus using baseOnly = TRUE
alphabet(zikaVirus, baseOnly = TRUE)
# Unlist the set and select the first 21 letters as dna_seq, then print it
dna_seq <- DNAString(subseq(as.character(zikaVirus), end = 21))
dna_seq
# 1.1 Transcribe dna_seq as rna_seq, then print it
rna_seq <- RNAString(dna_seq)
rna_seq
# 1.2 Translate rna_seq as aa_seq, then print it
aa_seq <- translate(rna_seq)
aa_seq
# 2.1 Translate dna_seq as aa_seq_2, then print it
aa_seq_2 <- translate(dna_seq)
aa_seq_2
# Create zikv with one collated sequence using `zikaVirus`
zikv <- unlist(zikaVirus)
# Check the length of zikaVirus and zikv
length(zikaVirus)
length(zikv)
# Check the width of zikaVirus
width(zikaVirus)
# Subset zikv to only the first 30 bases
subZikv <- subseq(zikv, end = 30)
subZikv
# The reverse of zikv is
reverse(zikv)
# The complement of zikv is
complement(zikv)
# The reverse complement of zikv is
reverseComplement(zikv)
# The translation of zikv is
translate(zikv)
# Find palindromes in zikv
findPalindromes(zikv)
# print the rnaframesZikaSet
rnaframesZikaSet
# translate all 6 reading frames
AAzika6F <- translate(rnaframesZikaSet)
AAzika6F
# Count the matches allowing 15 mismatches
vcountPattern(pattern = ns5, subject = AAzika6F, max.mismatch = 15)
# Select the frame that contains the match
selectedSet <- AAzika6F[3]
# Convert this frame into a single sequence
selectedSeq <- unlist(selectedSet)
# Use vmatchPattern with the set
vmatchPattern(pattern = ns5, subject = selectedSet, max.mismatch = 15)
# Use matchPattern with the single sequence
matchPattern(pattern = ns5, subject = selectedSeq, max.mismatch = 15)
Chapter 3 - IRanges and GenomicRanges
IRanges and Genomic Structures:
Gene of Interest:
Manipulating collections of GRanges:
Example code includes:
# load package IRanges
library(IRanges)
# start vector 1 to 5 and end 100
IRnum1 <- IRanges(start=1:5, end=100)
# end 100 and width 89 and 10
IRnum2 <- IRanges(end=100, width=c(89, 10))
# logical argument start = Rle(c(F, T, T, T, F, T, T, T))
IRlog1 <- IRanges(start = Rle(c(FALSE, TRUE, TRUE, TRUE, FALSE, TRUE, TRUE, TRUE)))
# Printing objects in a list
print(list(IRnum1 = IRnum1, IRnum2 = IRnum2, IRlog1 = IRlog1))
# Load Package Genomic Ranges
library(GenomicRanges)
# Print the GRanges object
myGR
# Check the metadata, if any
mcols(myGR)
# load human reference genome hg38
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
# assign hg38 to hg, then print it
hg <- TxDb.Hsapiens.UCSC.hg38.knownGene
hg
# extract all the genes in chromosome X as hg_chrXg, then print it
hg_chrXg <- genes(hg, filter = list(tx_chrom = c("chrX")))
hg_chrXg
# extract all positive stranded genes in chromosome X as hg_chrXgp, then sort it
hg_chrXgp <- genes(hg, filter = list(tx_chrom = c("chrX"), tx_strand = "+"))
sort(hg_chrXgp)
# load the human transcripts DB to hg
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
hg <- TxDb.Hsapiens.UCSC.hg38.knownGene
# prefilter chromosome X
seqlevels(hg) <- c("chrX")
# get all transcripts by gene
hg_chrXt <- transcriptsBy(hg, by="gene")
# select gene `215` from the transcripts
hg_chrXt[[215]]
# load the human transcripts DB to hg
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
hg <- TxDb.Hsapiens.UCSC.hg38.knownGene
# prefilter chromosome X
seqlevels(hg) <- c("chrX")
# get all transcripts by gene
hg_chrXt <- transcriptsBy(hg, by="gene")
# select gene `215` from the transcripts
hg_chrXt[['215']]
# Store the overlapping range in rangefound
rangefound <- subsetByOverlaps(hg_chrX, ABCD1)
# Check names of rangefound
names(rangefound)
# Check the geneOfInterest
ABCD1
# Check rangefound
rangefound
Chapter 4 - Introducing ShortRead
Sequence Files:
unique sequence identifier
Sequence Quality:
as_tibble() %>% # convert to tibble
mutate(cycle = 1:50) # add cycle numbers
Match and Filter:
Multiple Assessment:
Introduction to Bioconductor:
Example code includes:
# load ShortRead
library(ShortRead)
# print fqsample
fqsample
# class of fqsample
class(fqsample)
# class sread fqsample
class(sread(fqsample))
# id fqsample
id(fqsample)
qaSummary <- qa(fqsample, type = "fastq", lane = 1)
# load ShortRead
library(ShortRead)
# Check quality
quality(fqsample)
# Check encoding
encoding(quality(fqsample))
# Check baseQuality
qaSummary[["baseQuality"]]
# glimpse nucByCycle
glimpse(nucByCycle)
# make an awesome plot!
nucByCycle %>%
# gather the nucleotide letters in alphabet and get a new count column
gather(key = alphabet, value = count , -cycle) %>%
ggplot(aes(x = cycle, y = count, colour = alphabet)) +
geom_line(size = 0.5 ) +
labs(y = "Frequency") +
theme_bw() +
theme(panel.grid.major.x = element_blank())
myStartFilter <- srFilter(function(x) substr(sread(x), 1, 5) == "ATGCA")
# Load package ShortRead
library(ShortRead)
# Check class of fqsample
class(fqsample)
# filter reads into selectedReads using myStartFilter
selectedReads <- fqsample[myStartFilter(fqsample)]
# Check class of selectedReads
class(selectedReads)
# Check detail of selectedReads
detail(selectedReads)
# Load package Rqc
library(Rqc)
# Average per cycle quality plot
rqcCycleAverageQualityPlot(qa)
# Average per cycle quality plot with white background
rqcCycleAverageQualityPlot(qa) + theme_minimal()
# Read quality plot with white background
rqcReadQualityPlot(qa) + theme_minimal()
Chapter 1 - Introduction to Generalized Additive Models
Introduction:
Basis functions and smoothing:
Multivariate GAMs:
Example code includes:
data(mcycle, package="MASS")
# Examine the mcycle data frame
head(mcycle)
## times accel
## 1 2.4 0.0
## 2 2.6 -1.3
## 3 3.2 -2.7
## 4 3.6 0.0
## 5 4.0 -2.7
## 6 6.2 -2.7
plot(mcycle)
# Fit a linear model
lm_mod <- lm(accel ~ times, data = mcycle)
# Visualize the model
termplot(lm_mod, partial.resid = TRUE, se = TRUE)
# Load mgcv
library(mgcv)
## Loading required package: nlme
##
## Attaching package: 'nlme'
## The following object is masked from 'package:forecast':
##
## getResponse
## The following object is masked from 'package:lme4':
##
## lmList
## The following object is masked from 'package:dplyr':
##
## collapse
## This is mgcv 1.8-31. For overview type 'help("mgcv-package")'.
# Fit the model
gam_mod <- gam(accel ~ s(times), data = mcycle)
# Plot the results
plot(gam_mod, residuals = TRUE, pch = 1)
# Extract the model coefficients
coef(gam_mod)
## (Intercept) s(times).1 s(times).2 s(times).3 s(times).4 s(times).5
## -25.545865 -63.718008 43.475644 -110.350132 -22.181006 35.034423
## s(times).6 s(times).7 s(times).8 s(times).9
## 93.176458 -9.283018 -111.661472 17.603782
# Fit a GAM with 3 basis functions
gam_mod_k3 <- gam(accel ~ s(times, k = 3), data = mcycle)
# Fit with 20 basis functions
gam_mod_k20 <- gam(accel ~ s(times, k = 20), data = mcycle)
# Visualize the GAMs
par(mfrow = c(1, 2))
plot(gam_mod_k3, residuals = TRUE, pch = 1)
plot(gam_mod_k20, residuals = TRUE, pch = 1)
par(mfrow = c(1, 1))
# Extract the smoothing parameter (estimated by REML)
gam_mod <- gam(accel ~ s(times), data = mcycle, method = "REML")
gam_mod$sp
## s(times)
## 0.0007758036
# Fix the smoothing parameter at 0.1
gam_mod_s1 <- gam(accel ~ s(times), data = mcycle, sp = 0.1)
# Fix the smoothing parameter at 0.0001
gam_mod_s2 <- gam(accel ~ s(times), data = mcycle, sp = 0.0001)
# Plot both models
par(mfrow = c(2, 1))
plot(gam_mod_s1, residuals = TRUE, pch = 1)
plot(gam_mod_s2, residuals = TRUE, pch = 1)
par(mfrow = c(1, 1))
# Fit the GAM
gam_mod_sk <- gam(accel ~ s(times, k=50), sp=0.0001, data=mcycle)
# Visualize the model
plot(gam_mod_sk, residuals = TRUE, pch = 1)
data(mpg, package="gamair")
# Examine the data
head(mpg)
## symbol loss make fuel aspir doors style drive eng.loc wb
## 1 3 NA alfa-romero gas std two convertible rwd front 88.6
## 2 3 NA alfa-romero gas std two convertible rwd front 88.6
## 3 1 NA alfa-romero gas std two hatchback rwd front 94.5
## 4 2 164 audi gas std four sedan fwd front 99.8
## 5 2 164 audi gas std four sedan 4wd front 99.4
## 6 2 NA audi gas std two sedan fwd front 99.8
## length width height weight eng.type cylinders eng.cc fuel.sys bore stroke
## 1 168.8 64.1 48.8 2548 dohc four 130 mpfi 3.47 2.68
## 2 168.8 64.1 48.8 2548 dohc four 130 mpfi 3.47 2.68
## 3 171.2 65.5 52.4 2823 ohcv six 152 mpfi 2.68 3.47
## 4 176.6 66.2 54.3 2337 ohc four 109 mpfi 3.19 3.40
## 5 176.6 66.4 54.3 2824 ohc five 136 mpfi 3.19 3.40
## 6 177.3 66.3 53.1 2507 ohc five 136 mpfi 3.19 3.40
## comp.ratio hp rpm city.mpg hw.mpg price
## 1 9.0 111 5000 21 27 13495
## 2 9.0 111 5000 21 27 16500
## 3 9.0 154 5000 19 26 16500
## 4 10.0 102 5500 24 30 13950
## 5 8.0 115 5500 18 22 17450
## 6 8.5 110 5500 19 25 15250
str(mpg)
## 'data.frame': 205 obs. of 26 variables:
## $ symbol : int 3 3 1 2 2 2 1 1 1 0 ...
## $ loss : int NA NA NA 164 164 NA 158 NA 158 NA ...
## $ make : Factor w/ 22 levels "alfa-romero",..: 1 1 1 2 2 2 2 2 2 2 ...
## $ fuel : Factor w/ 2 levels "diesel","gas": 2 2 2 2 2 2 2 2 2 2 ...
## $ aspir : Factor w/ 2 levels "std","turbo": 1 1 1 1 1 1 1 1 2 2 ...
## $ doors : Factor w/ 2 levels "four","two": 2 2 2 1 1 2 1 1 1 2 ...
## $ style : Factor w/ 5 levels "convertible",..: 1 1 3 4 4 4 4 5 4 3 ...
## $ drive : Factor w/ 3 levels "4wd","fwd","rwd": 3 3 3 2 1 2 2 2 2 1 ...
## $ eng.loc : Factor w/ 2 levels "front","rear": 1 1 1 1 1 1 1 1 1 1 ...
## $ wb : num 88.6 88.6 94.5 99.8 99.4 ...
## $ length : num 169 169 171 177 177 ...
## $ width : num 64.1 64.1 65.5 66.2 66.4 66.3 71.4 71.4 71.4 67.9 ...
## $ height : num 48.8 48.8 52.4 54.3 54.3 53.1 55.7 55.7 55.9 52 ...
## $ weight : int 2548 2548 2823 2337 2824 2507 2844 2954 3086 3053 ...
## $ eng.type : Factor w/ 7 levels "dohc","dohcv",..: 1 1 6 4 4 4 4 4 4 4 ...
## $ cylinders : Factor w/ 7 levels "eight","five",..: 3 3 4 3 2 2 2 2 2 2 ...
## $ eng.cc : int 130 130 152 109 136 136 136 136 131 131 ...
## $ fuel.sys : Factor w/ 8 levels "1bbl","2bbl",..: 6 6 6 6 6 6 6 6 6 6 ...
## $ bore : num 3.47 3.47 2.68 3.19 3.19 3.19 3.19 3.19 3.13 3.13 ...
## $ stroke : num 2.68 2.68 3.47 3.4 3.4 3.4 3.4 3.4 3.4 3.4 ...
## $ comp.ratio: num 9 9 9 10 8 8.5 8.5 8.5 8.3 7 ...
## $ hp : int 111 111 154 102 115 110 110 110 140 160 ...
## $ rpm : int 5000 5000 5000 5500 5500 5500 5500 5500 5500 5500 ...
## $ city.mpg : int 21 21 19 24 18 19 19 19 17 16 ...
## $ hw.mpg : int 27 27 26 30 22 25 25 25 20 22 ...
## $ price : int 13495 16500 16500 13950 17450 15250 17710 18920 23875 NA ...
# Fit the model: smooths of weight, length, and price predicting city fuel economy
mod_city <- gam(city.mpg ~ s(weight) + s(length) + s(price), data = mpg, method = "REML")
# Plot the model (pages = 1 puts all smooths on one page)
plot(mod_city, pages = 1)
# Fit the model adding categorical predictors as parametric (linear) terms
mod_city2 <- gam(city.mpg ~ s(weight) + s(length) + s(price) + fuel + drive + style, data = mpg, method = "REML")
# Plot the model (all.terms = TRUE also draws the parametric terms)
plot(mod_city2, all.terms = TRUE, pages = 1)
# Fit the model: by = drive gives each drive level its own smooth of each predictor
mod_city3 <- gam(city.mpg ~ s(weight, by=drive) + s(length, by=drive) + s(price, by=drive) + drive,
data = mpg, method = "REML"
)
# Plot the model
plot(mod_city3, pages = 1)
Chapter 2 - Interpreting and Visualizing GAMs
Interpreting GAM Outputs:
Visualizing GAMs:
Model checking with gam.check():
Checking concurvity:
Example code includes:
# Fit the model
mod_city4 <- gam(city.mpg ~ s(weight) + s(length) + s(price) + s(rpm) + s(width),
data = mpg, method = "REML")
# View the summary
summary(mod_city4)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## city.mpg ~ s(weight) + s(length) + s(price) + s(rpm) + s(width)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 25.201 0.188 134 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(weight) 5.620 6.799 17.524 < 2e-16 ***
## s(length) 2.943 3.759 0.904 0.420
## s(price) 1.000 1.000 16.647 6.68e-05 ***
## s(rpm) 7.751 8.499 16.486 < 2e-16 ***
## s(width) 1.003 1.005 0.006 0.939
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.831 Deviance explained = 84.7%
## -REML = 496.47 Scale est. = 7.0365 n = 199
# Fit the model
mod <- gam(accel ~ s(times), data = mcycle, method = "REML")
# Make the plot with residuals
plot(mod, residuals=TRUE)
# Change shape of residuals (pch = 1: open circles; cex = 1: default size)
plot(mod, residuals=TRUE, pch=1, cex=1)
# Fit the model (comp.ratio enters as a parametric linear term)
mod <- gam(hw.mpg ~ s(weight) + s(rpm) + s(price) + comp.ratio,
data = mpg, method = "REML")
# Plot the price effect (select = 3: price is the third smooth in the formula)
plot(mod, select=c(3))
# Plot all effects, including the parametric comp.ratio term
plot(mod, all.terms=TRUE, pages=1)
# Plot the weight effect with colored shading for the confidence band
plot(mod, select = 1, shade=TRUE, shade.col="hotpink")
# Add the intercept value and uncertainty: shift moves the curve by the intercept,
# seWithMean includes intercept uncertainty in the band
plot(mod, select = 1, shade=TRUE, shade.col="hotpink", seWithMean=TRUE, shift=coef(mod)[1])
dat <- data.frame(y=c(11.17, 2.81, 12.9, 5.68, 5.58, -1.09, 5.42, 12.13, 4.73, 6.29, 5.74, 8.32, 9.76, 4.78, 9.08, 10.5, 9.4, 9.51, 14.58, 13.84, 4.01, 3.31, 5.32, 6.6, 10.54, 13.19, 10.06, 8.6, -0.62, 4.78, 5.98, 2.75, 1.36, 8.51, 8.12, 4.18, 10.65, 5.92, -0.03, 6.48, 9.12, 6.57, 15.38, 11.76, 7.47, 12, 3.4, 3.39, 0.95, 5.49, 7.92, 8.04, 8.81, 6.65, 8.93, 0.55, 6.73, 3.38, 4.42, 8.23, 12.2, 14.45, 2.82, 5.58, 8.74, 14.14, 5.74, 4.59, 14.54, 6.65, 4.21, 8.71, 1.76, 6.22, 8.87, 10.3, 9.18, 5.05, 5.44, 4.86, 3.25, 4.59, 12.01, 6.69, 6.3, 6.85, 5.45, 15.43, -0.9, 3.43, 9.83, 1.04, 1.16, 16.7, 9.16, 8.46, 7.81, 4.97, 7.46, 1.49, 8.01, 9.48, 9.43, 3.92, 6.2, 7.63, 8.56, 11.53, 9.98, 2.49, 5.67, 3.48, 7.92, 8.62, 7.44, 6.35, 10.88, 9.74, 3.79, 15.43, 6.56, 2.5, 6.66, 9.75, 12.72, 14.64, 8.9, 10.74, 5.93, 2.53, 3.69, 15.25, 0.5, 11.8, 13.19, 6.05, -1.26, 9.09, 9.78, 7.23, 11.67, 12.54, -0.36, 9.4, 7.87, 13.46, 9.33, 2.55, 9.23, 5.95, 10.46, 3.39, 3.81, 7.25, 3.94, 10.18, 8.63, 11.51, 2.42, 9.44, 5.95, 7.75, 10.16, 16.11, 5.16, 3.13, 7.75, 9.96, 7.27, 14.62, 3.88, 10.2, 5.86, 16.18, 5.4, 1.55, 2.91, 9.16, 9.77, 2.25, 5.01, 8.79, 3.34, 7.09, 8.18, 3.34, 8.02, 8.12, 6.69, 3.22, 8.15, 5.01, 11.51, 6.62, 7.07, 0.52, 10.26, 7.99, 8.98, 9.87),
x0=c(0.9, 0.27, 0.37, 0.57, 0.91, 0.2, 0.9, 0.94, 0.66, 0.63, 0.06, 0.21, 0.18, 0.69, 0.38, 0.77, 0.5, 0.72, 0.99, 0.38, 0.78, 0.93, 0.21, 0.65, 0.13, 0.27, 0.39, 0.01, 0.38, 0.87, 0.34, 0.48, 0.6, 0.49, 0.19, 0.83, 0.67, 0.79, 0.11, 0.72, 0.41, 0.82, 0.65, 0.78, 0.55, 0.53, 0.79, 0.02, 0.48, 0.73, 0.69, 0.48, 0.86, 0.44, 0.24, 0.07, 0.1, 0.32, 0.52, 0.66, 0.41, 0.91, 0.29, 0.46, 0.33, 0.65, 0.26, 0.48, 0.77, 0.08, 0.88, 0.34, 0.84, 0.35, 0.33, 0.48, 0.89, 0.86, 0.39, 0.78, 0.96, 0.43, 0.71, 0.4, 0.33, 0.76, 0.2, 0.71, 0.12, 0.25, 0.14, 0.24, 0.06, 0.64, 0.88, 0.78, 0.8, 0.46, 0.41, 0.81, 0.6, 0.65, 0.35, 0.27, 0.99, 0.63, 0.21, 0.13, 0.48, 0.92, 0.6, 0.98, 0.73, 0.36, 0.43, 0.15, 0.01, 0.72, 0.1, 0.45, 0.64, 0.99, 0.5, 0.48, 0.17, 0.75, 0.45, 0.51, 0.21, 0.23, 0.6, 0.57, 0.08, 0.04, 0.64, 0.93, 0.6, 0.56, 0.53, 0.99, 0.51, 0.68, 0.6, 0.24, 0.26, 0.73, 0.45, 0.18, 0.75, 0.1, 0.86, 0.61, 0.56, 0.33, 0.45, 0.5, 0.18, 0.53, 0.08, 0.28, 0.21, 0.28, 0.9, 0.45, 0.78, 0.88, 0.41, 0.06, 0.34, 0.72, 0.34, 0.63, 0.84, 0.86, 0.39, 0.38, 0.9, 0.64, 0.74, 0.61, 0.9, 0.29, 0.19, 0.89, 0.5, 0.88, 0.19, 0.76, 0.72, 0.94, 0.55, 0.71, 0.39, 0.1, 0.93, 0.28, 0.59, 0.11, 0.84, 0.32),
x1=c(0.78, 0.27, 0.22, 0.52, 0.27, 0.18, 0.52, 0.56, 0.13, 0.26, 0.72, 0.96, 0.1, 0.76, 0.95, 0.82, 0.31, 0.65, 0.95, 0.95, 0.34, 0.26, 0.17, 0.32, 0.51, 0.92, 0.51, 0.31, 0.05, 0.42, 0.85, 0.35, 0.13, 0.37, 0.63, 0.39, 0.69, 0.69, 0.55, 0.43, 0.45, 0.31, 0.58, 0.91, 0.14, 0.42, 0.21, 0.43, 0.13, 0.46, 0.94, 0.76, 0.93, 0.47, 0.6, 0.48, 0.11, 0.25, 0.5, 0.37, 0.93, 0.52, 0.32, 0.28, 0.79, 0.7, 0.17, 0.06, 0.75, 0.62, 0.17, 0.06, 0.11, 0.38, 0.17, 0.3, 0.19, 0.26, 0.18, 0.48, 0.77, 0.03, 0.53, 0.88, 0.37, 0.05, 0.14, 0.32, 0.15, 0.13, 0.22, 0.23, 0.13, 0.98, 0.33, 0.51, 0.68, 0.1, 0.12, 0.05, 0.93, 0.67, 0.09, 0.49, 0.46, 0.38, 0.99, 0.18, 0.81, 0.07, 0.4, 0.14, 0.19, 0.84, 0.72, 0.27, 0.5, 0.08, 0.35, 0.97, 0.62, 0.66, 0.31, 0.41, 1, 0.86, 0.95, 0.81, 0.78, 0.27, 0.76, 0.99, 0.29, 0.4, 0.81, 0.08, 0.36, 0.44, 0.16, 0.58, 0.97, 0.99, 0.18, 0.54, 0.38, 0.68, 0.27, 0.47, 0.17, 0.37, 0.73, 0.49, 0.06, 0.78, 0.42, 0.98, 0.28, 0.85, 0.08, 0.89, 0.47, 0.11, 0.33, 0.84, 0.28, 0.59, 0.84, 0.07, 0.7, 0.7, 0.46, 0.44, 0.56, 0.93, 0.23, 0.22, 0.42, 0.33, 0.86, 0.18, 0.49, 0.43, 0.56, 0.66, 0.98, 0.23, 0.24, 0.8, 0.83, 0.11, 0.96, 0.15, 0.14, 0.93, 0.51, 0.15, 0.35, 0.66, 0.31, 0.35),
x2=c(0.15, 0.66, 0.19, 0.95, 0.9, 0.94, 0.72, 0.37, 0.78, 0.01, 0.94, 0.99, 0.36, 0.75, 0.79, 0.71, 0.48, 0.49, 0.31, 0.7, 0.82, 0.43, 0.51, 0.66, 0.14, 0.34, 0.41, 0.09, 0.93, 0.84, 0.88, 0.94, 0.07, 0.38, 0.54, 0.11, 0.8, 0.74, 0.05, 0.48, 0.92, 0.04, 0.29, 0.5, 0.61, 0.26, 0.42, 0.37, 0.94, 0.12, 0.07, 0.96, 0.44, 0.37, 0.14, 0.05, 0.66, 0.58, 0.99, 0.6, 0.06, 0.16, 0.48, 0, 0.44, 0.26, 0.94, 0.72, 0.16, 0.48, 0.69, 0.46, 0.96, 0.71, 0.4, 0.12, 0.24, 0.86, 0.44, 0.5, 0.69, 0.76, 0.16, 0.85, 0.95, 0.59, 0.5, 0.19, 0, 0.88, 0.13, 0.02, 0.94, 0.29, 0.16, 0.4, 0.46, 0.43, 0.52, 0.85, 0.06, 0.55, 0.69, 0.66, 0.66, 0.47, 0.97, 0.4, 0.85, 0.76, 0.53, 0.87, 0.47, 0.01, 0.73, 0.72, 0.19, 0.65, 0.54, 0.34, 0.64, 0.83, 0.71, 0.35, 0.13, 0.39, 0.93, 0.8, 0.76, 0.96, 0.99, 0.61, 0.03, 0.34, 0.28, 0.12, 0.04, 0.37, 0.34, 0.17, 0.62, 0.4, 0.96, 0.65, 0.33, 0.2, 0.12, 1, 0.38, 0.56, 0.73, 0.87, 0.57, 0.01, 0.91, 0.77, 0.38, 0.09, 0.05, 0.82, 0.83, 0.65, 0.13, 0.34, 0.73, 0.91, 0.7, 0.24, 0.64, 0.28, 0.96, 0.16, 0.42, 0.25, 0.09, 0.83, 0.53, 0.67, 0.41, 0.84, 0.74, 0.35, 0.95, 0.65, 0.04, 0.6, 0.42, 0.08, 0.53, 0.96, 0.71, 0.55, 0.24, 0.78, 0.65, 0.83, 0.65, 0.48, 0.5, 0.38),
x3=c(0.45, 0.81, 0.93, 0.15, 0.75, 0.98, 0.97, 0.35, 0.39, 0.95, 0.11, 0.93, 0.35, 0.53, 0.54, 0.71, 0.41, 0.15, 0.34, 0.63, 0.06, 0.85, 0.21, 0.77, 0.14, 0.32, 0.62, 0.26, 0.63, 0.49, 0.94, 0.86, 0.37, 0.31, 0.83, 0.45, 0.32, 0.1, 0.06, 0.69, 0.67, 0.9, 0.3, 0.93, 0.2, 0.79, 0.22, 0.03, 0.86, 0.69, 0.94, 0.68, 0.84, 0.36, 0.39, 0.57, 0.1, 0.19, 0.59, 0.75, 0.87, 0.37, 0.8, 0.06, 0.62, 0.36, 0.59, 0.91, 0.2, 0.37, 0.67, 0.77, 0.52, 0.83, 0.53, 0.5, 0.42, 0.36, 0.12, 0.3, 0.28, 0.79, 0.78, 0.14, 0.52, 0.6, 0.51, 0.39, 0.43, 0.01, 0.92, 0.08, 0.51, 0.82, 0.6, 0.42, 0.56, 0.79, 0.17, 0.97, 0.47, 0.93, 0.9, 0.75, 0.68, 0.65, 0.07, 0.42, 0.53, 0.94, 0.71, 0.72, 0.47, 0.12, 0.78, 0.44, 0.43, 0.03, 0.15, 0.42, 0.77, 0, 0.6, 0.91, 0.71, 0.26, 0.85, 0.33, 0.58, 0.43, 0.05, 0.73, 0.55, 0.75, 0.05, 0.71, 0.3, 0.28, 0.83, 0.09, 0.04, 0.35, 0.54, 0.61, 0.27, 0.21, 0.38, 0.47, 0.84, 0.12, 0.68, 0.5, 0.9, 0.55, 0.13, 0.44, 0.19, 0.43, 0.23, 0.96, 0.45, 0.78, 0.16, 0.87, 0.21, 0.18, 0.16, 0.57, 0.73, 0.88, 0.71, 0.48, 0.82, 0.02, 1, 0.63, 0.43, 0.03, 0.75, 0.21, 1, 0.91, 0.71, 0.73, 0.47, 0.86, 0.17, 0.62, 0.29, 0.46, 0.05, 0.18, 0.06, 0.94, 0.34, 0.52, 0.63, 0.24, 0.52, 0.81)
)
str(dat)
## 'data.frame': 200 obs. of 5 variables:
## $ y : num 11.17 2.81 12.9 5.68 5.58 ...
## $ x0: num 0.9 0.27 0.37 0.57 0.91 0.2 0.9 0.94 0.66 0.63 ...
## $ x1: num 0.78 0.27 0.22 0.52 0.27 0.18 0.52 0.56 0.13 0.26 ...
## $ x2: num 0.15 0.66 0.19 0.95 0.9 0.94 0.72 0.37 0.78 0.01 ...
## $ x3: num 0.45 0.81 0.93 0.15 0.75 0.98 0.97 0.35 0.39 0.95 ...
# Fit the model with k = 5 basis functions per smooth
mod <- gam(y ~ s(x0, k = 5) + s(x1, k = 5) + s(x2, k = 5) + s(x3, k = 5),
data = dat, method = "REML")
# Run the check function: reports convergence and per-smooth basis-size (k) diagnostics
gam.check(mod)
##
## Method: REML Optimizer: outer newton
## full convergence after 10 iterations.
## Gradient range [-0.0001426464,0.0001241444]
## (score 461.1064 & scale 5.242973).
## Hessian positive definite, eigenvalue range [0.0001426384,97.53228].
## Model rank = 17 / 17
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(x0) 4.00 2.53 0.92 0.12
## s(x1) 4.00 2.22 1.07 0.80
## s(x2) 4.00 3.94 0.84 <2e-16 ***
## s(x3) 4.00 1.00 1.01 0.48
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Fit the model with a deliberately small basis (k = 3) for every smooth
mod <- gam(y ~ s(x0, k = 3) + s(x1, k = 3) + s(x2, k = 3) + s(x3, k = 3),
data = dat, method = "REML")
# Check the diagnostics — a low k-index/p-value flags a smooth whose k is too small
gam.check(mod)
##
## Method: REML Optimizer: outer newton
## full convergence after 10 iterations.
## Gradient range [-0.0002159481,0.0007368124]
## (score 493.6694 & scale 7.805066).
## Hessian positive definite, eigenvalue range [0.0002170151,97.50484].
## Model rank = 9 / 9
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(x0) 2.00 1.85 0.97 0.30
## s(x1) 2.00 1.71 1.06 0.75
## s(x2) 2.00 1.97 0.57 <2e-16 ***
## s(x3) 2.00 1.00 1.09 0.85
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Refit to fix issues: raise k only for s(x2), the term that failed the basis check
mod2 <- gam(y ~ s(x0, k = 3) + s(x1, k = 3) + s(x2, k = 12) + s(x3, k = 3),
data = dat, method = "REML")
# Check the new model — s(x2) should now pass the k-index check
gam.check(mod2)
##
## Method: REML Optimizer: outer newton
## full convergence after 9 iterations.
## Gradient range [-0.0001262011,0.0001907036]
## (score 452.0731 & scale 4.569005).
## Hessian positive definite, eigenvalue range [0.01536015,97.63581].
## Model rank = 18 / 18
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(x0) 2.00 1.93 0.91 0.10
## s(x1) 2.00 1.89 1.12 0.97
## s(x2) 11.00 8.07 0.97 0.29
## s(x3) 2.00 1.18 1.04 0.76
# Fit the model with four correlated size-related smooths
mod <- gam(hw.mpg ~ s(length) + s(width) + s(height) + s(weight),
data = mpg, method = "REML")
# Check overall concurvity: how much each smooth can be approximated by all the others
concurvity(mod, full=TRUE)
## para s(length) s(width) s(height) s(weight)
## worst 1.079374e-20 0.9303404 0.9322887 0.6723705 0.9603887
## observed 1.079374e-20 0.7534619 0.8757513 0.4869308 0.8793300
## estimate 1.079374e-20 0.8353324 0.7943374 0.4452676 0.8567519
# Check pairwise concurvity (full = FALSE returns term-by-term matrices)
concurvity(mod, full=FALSE)
## $worst
## para s(length) s(width) s(height) s(weight)
## para 1.000000e+00 4.799804e-26 5.458174e-21 4.926340e-23 3.221614e-25
## s(length) 4.759962e-26 1.000000e+00 8.336513e-01 6.058015e-01 8.797217e-01
## s(width) 5.458344e-21 8.336513e-01 1.000000e+00 4.099837e-01 8.953662e-01
## s(height) 4.927251e-23 6.058015e-01 4.099837e-01 1.000000e+00 3.665831e-01
## s(weight) 3.233688e-25 8.797217e-01 8.953662e-01 3.665831e-01 1.000000e+00
##
## $observed
## para s(length) s(width) s(height) s(weight)
## para 1.000000e+00 1.128295e-29 4.467995e-32 9.887661e-34 6.730965e-31
## s(length) 4.759962e-26 1.000000e+00 7.511142e-01 2.827977e-01 8.232449e-01
## s(width) 5.458344e-21 5.077384e-01 1.000000e+00 1.186126e-01 7.813743e-01
## s(height) 4.927251e-23 2.284116e-01 3.313152e-01 1.000000e+00 2.900361e-01
## s(weight) 3.233688e-25 6.052819e-01 7.863555e-01 1.494913e-01 1.000000e+00
##
## $estimate
## para s(length) s(width) s(height) s(weight)
## para 1.000000e+00 1.564968e-28 1.740649e-23 3.448567e-25 1.481483e-27
## s(length) 4.759962e-26 1.000000e+00 6.415191e-01 2.271285e-01 7.209033e-01
## s(width) 5.458344e-21 6.477497e-01 1.000000e+00 1.054762e-01 7.241891e-01
## s(height) 4.927251e-23 3.303484e-01 2.644827e-01 1.000000e+00 2.669300e-01
## s(weight) 3.233688e-25 7.235198e-01 6.913221e-01 1.390568e-01 1.000000e+00
Chapter 3 - Spatial GAMs and Interactions
Two-dimensional smooths and spatial data:
Plotting and interpreting GAM interactions:
Visualizing categorical-continuous interactions:
Interactions with different scales: Tensors:
Example code includes:
# Inspect the data: meuse (sp package) holds soil heavy-metal measurements
# (cadmium, copper, lead, zinc, ...) with x/y spatial coordinates
data(meuse, package="sp")
head(meuse)
## x y cadmium copper lead zinc elev dist om ffreq soil lime
## 1 181072 333611 11.7 85 299 1022 7.909 0.00135803 13.6 1 1 1
## 2 181025 333558 8.6 81 277 1141 6.983 0.01222430 14.0 1 1 1
## 3 181165 333537 6.5 68 199 640 7.800 0.10302900 13.0 1 1 1
## 4 181298 333484 2.6 81 116 257 7.655 0.19009400 8.0 1 2 0
## 5 181307 333330 2.8 48 117 269 7.480 0.27709000 8.7 1 2 0
## 6 181390 333260 3.0 61 137 281 7.791 0.36406700 7.8 1 2 0
## landuse dist.m
## 1 Ah 50
## 2 Ah 30
## 3 Ah 150
## 4 Ga 270
## 5 Ah 380
## 6 Ga 470
str(meuse)
## 'data.frame': 155 obs. of 14 variables:
## $ x : num 181072 181025 181165 181298 181307 ...
## $ y : num 333611 333558 333537 333484 333330 ...
## $ cadmium: num 11.7 8.6 6.5 2.6 2.8 3 3.2 2.8 2.4 1.6 ...
## $ copper : num 85 81 68 81 48 61 31 29 37 24 ...
## $ lead : num 299 277 199 116 117 137 132 150 133 80 ...
## $ zinc : num 1022 1141 640 257 269 ...
## $ elev : num 7.91 6.98 7.8 7.66 7.48 ...
## $ dist : num 0.00136 0.01222 0.10303 0.19009 0.27709 ...
## $ om : num 13.6 14 13 8 8.7 7.8 9.2 9.5 10.6 6.3 ...
## $ ffreq : Factor w/ 3 levels "1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
## $ soil : Factor w/ 3 levels "1","2","3": 1 1 1 2 2 2 2 1 1 2 ...
## $ lime : Factor w/ 2 levels "0","1": 2 2 2 1 1 1 1 1 1 1 ...
## $ landuse: Factor w/ 15 levels "Aa","Ab","Ag",..: 4 4 4 11 4 11 4 2 2 15 ...
## $ dist.m : num 50 30 150 270 380 470 240 120 240 420 ...
# Fit the 2-D model: s(x, y) is a single two-dimensional smooth of the coordinates
mod2d <- gam(cadmium ~ s(x, y), data=meuse, method="REML")
# Inspect the model
summary(mod2d)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## cadmium ~ s(x, y)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.2458 0.1774 18.3 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(x,y) 23.48 27.24 8.667 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.607 Deviance explained = 66.7%
## -REML = 372.07 Scale est. = 4.8757 n = 155
coef(mod2d)
## (Intercept) s(x,y).1 s(x,y).2 s(x,y).3 s(x,y).4 s(x,y).5
## 3.2458065 0.8686658 -10.2154908 6.4161781 -2.6784725 9.1807111
## s(x,y).6 s(x,y).7 s(x,y).8 s(x,y).9 s(x,y).10 s(x,y).11
## 3.7004932 -10.4780937 -8.9821840 -0.6218677 -4.6789789 -5.4267313
## s(x,y).12 s(x,y).13 s(x,y).14 s(x,y).15 s(x,y).16 s(x,y).17
## 7.4996452 8.1962843 -7.6311640 4.5829340 -0.9734724 0.7634059
## s(x,y).18 s(x,y).19 s(x,y).20 s(x,y).21 s(x,y).22 s(x,y).23
## 8.8112827 -4.8639552 -6.8085148 3.8059356 6.3499868 6.4701169
## s(x,y).24 s(x,y).25 s(x,y).26 s(x,y).27 s(x,y).28 s(x,y).29
## -8.1556061 7.2050985 0.1567317 -53.4384704 -4.2860149 5.5212533
# Models of the form s(x, y) + s(v1) + ... are a great way to model spatial data:
# the 2-D smooth captures spatial structure while the 1-D smooths capture
# independent predictor effects.
# Fit the model
mod2da <- gam(cadmium ~ s(x, y) +s(elev) + s(dist),
data = meuse, method = "REML")
# Inspect the model
summary(mod2da)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## cadmium ~ s(x, y) + s(elev) + s(dist)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.2458 0.1238 26.21 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(x,y) 20.398 24.599 2.324 0.00078 ***
## s(elev) 1.282 1.496 28.868 6.52e-08 ***
## s(dist) 6.609 7.698 13.677 5.25e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.809 Deviance explained = 84.4%
## -REML = 321.06 Scale est. = 2.3762 n = 155
# Contour plot (default plot scheme for a 2-D smooth)
plot(mod2da, pages = 1)
# 3D surface plot (scheme = 1)
plot(mod2da, scheme=1, pages = 1)
# Colored heat map (scheme = 2)
plot(mod2da, scheme=2, pages=1)
# Make the perspective plot with error surfaces (se = 2: +/- 2 SE surfaces)
vis.gam(mod2d, view = c("x", "y"), plot.type="persp", se=2)
# Rotate the same plot (theta sets the viewing azimuth)
vis.gam(mod2d, view = c("x", "y"), plot.type="persp", se=2, theta=135)
# Make plot with 5% extrapolation (too.far blanks regions far from the data)
vis.gam(mod2d, view = c("x", "y"), plot.type = "contour", too.far=0.05)
# Overlay data
points(meuse)
# Make plot with 10% extrapolation
vis.gam(mod2d, view = c("x", "y"), plot.type="contour", too.far=0.1)
# Overlay data
points(meuse)
# Make plot with 25% extrapolation
vis.gam(mod2d, view = c("x", "y"),
plot.type = "contour", too.far = 0.25)
# Overlay data
points(meuse)
# Fit a model with separate smooths for each land-use level (by = factor:
# one independently-penalized smooth of dist per landuse level)
mod_sep <- gam(copper ~ s(dist, by = landuse), data = meuse, method = "REML")
# Examine the summary
summary(mod_sep)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## copper ~ s(dist, by = landuse)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.726 1.371 26.78 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(dist):landuseAa 1.371 1.605 0.493 0.43082
## s(dist):landuseAb 1.000 1.000 1.674 0.19792
## s(dist):landuseAg 1.514 1.815 0.940 0.28255
## s(dist):landuseAh 2.496 3.081 8.783 1.96e-05 ***
## s(dist):landuseAm 1.000 1.000 8.606 0.00395 **
## s(dist):landuseB 1.000 1.000 1.207 0.27401
## s(dist):landuseBw 1.000 1.000 0.007 0.93520
## s(dist):landuseDEN 1.000 1.000 0.230 0.63255
## s(dist):landuseFh 1.000 1.000 0.698 0.40494
## s(dist):landuseFw 2.754 3.377 5.289 0.00120 **
## s(dist):landuseGa 2.791 2.958 3.720 0.01092 *
## s(dist):landuseSPO 1.000 1.000 1.101 0.29599
## s(dist):landuseSTA 1.245 1.430 0.179 0.65089
## s(dist):landuseTv 1.000 1.000 0.698 0.40495
## s(dist):landuseW 4.333 5.289 37.857 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.642 Deviance explained = 69.9%
## -REML = 580.91 Scale est. = 195.07 n = 154
# Fit a model with a factor-smooth interaction (bs = "fs"): one curve per
# landuse level, sharing a common smoothing parameter
mod_fs <- gam(copper ~ s(dist, landuse, bs="fs"), data = meuse, method = "REML")
# Examine the summary — the factor-smooth appears as a single combined term
summary(mod_fs)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## copper ~ s(dist, landuse, bs = "fs")
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 30.07 3.33 9.031 1.43e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(dist,landuse) 16.37 71 2.463 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.533 Deviance explained = 58.3%
## -REML = 659.94 Scale est. = 254.2 n = 154
# Plot both the models with plot() to compare the per-level smooths
plot(mod_sep, pages=1)
plot(mod_fs, pages=1)
# Plot both the models with vis.gam() as perspective surfaces over dist x landuse
vis.gam(mod_sep, view = c("dist", "landuse"), plot.type = "persp")
vis.gam(mod_fs, view = c("dist", "landuse"), plot.type = "persp")
# Fit the model: te() is a tensor-product smooth, appropriate when the inputs
# (coordinates and elevation) are measured on different scales
tensor_mod <- gam(cadmium ~ te(x, y, elev), data=meuse, method="REML")
# Summarize and plot
summary(tensor_mod)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## cadmium ~ te(x, y, elev)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.2458 0.1329 24.43 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## te(x,y,elev) 38.29 45.86 11.87 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.78 Deviance explained = 83.4%
## -REML = 318.09 Scale est. = 2.7358 n = 155
plot(tensor_mod)
# Fit the model: ti() terms separate the main-effect smooths from the pure
# interaction; te(elev) here acts as a 1-D smooth of elevation
tensor_mod2 <- gam(cadmium ~ ti(x, y) + te(elev) + ti(x, y, elev), data=meuse, method="REML")
# Summarize and plot
summary(tensor_mod2)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## cadmium ~ ti(x, y) + te(elev) + ti(x, y, elev)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.5102 0.4311 8.143 3.61e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## ti(x,y) 10.80 12.132 6.026 7.40e-09 ***
## te(elev) 2.79 3.099 11.317 1.14e-06 ***
## ti(x,y,elev) 17.20 22.376 2.759 0.00017 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.714 Deviance explained = 77.1%
## -REML = 349.54 Scale est. = 3.5476 n = 155
plot(tensor_mod2, pages = 1)
# Reset the plotting device to a single panel
par(mfrow=c(1, 1))
Chapter 4 - Logistic GAM for Classification
Types of model outcome:
Visualizing logistic GAMs:
Making predictions:
Wrap up and next steps:
Example code includes:
# Load the credit-sale data (binary purchase outcome plus credit predictors)
csale <- readRDS("./RInputFiles/csale.rds")
# Examine the csale data frame
head(csale)
## # A tibble: 6 x 8
## purchase n_acts bal_crdt_ratio avg_prem_balance retail_crdt_rat~
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 11 0 2494. 0
## 2 0 0 36.1 2494. 11.5
## 3 0 6 17.6 2494. 0
## 4 0 8 12.5 2494. 0.8
## 5 0 8 59.1 2494. 20.8
## 6 0 1 90.1 2494. 11.5
## # ... with 3 more variables: avg_fin_balance <dbl>, mortgage_age <dbl>,
## # cred_limit <dbl>
str(csale)
## Classes 'tbl_df', 'tbl' and 'data.frame': 1779 obs. of 8 variables:
## $ purchase : num 0 0 0 0 0 0 0 0 1 0 ...
## $ n_acts : num 11 0 6 8 8 1 5 0 9 18 ...
## $ bal_crdt_ratio : num 0 36.1 17.6 12.5 59.1 ...
## $ avg_prem_balance : num 2494 2494 2494 2494 2494 ...
## $ retail_crdt_ratio: num 0 11.5 0 0.8 20.8 ...
## $ avg_fin_balance : num 1767 1767 0 1021 797 ...
## $ mortgage_age : num 182 139 139 139 93 ...
## $ cred_limit : num 12500 0 0 0 0 0 0 0 11500 16000 ...
# Fit a logistic model (family = binomial uses the logit link for a 0/1 outcome)
log_mod <- gam(purchase ~ s(mortgage_age), data = csale, family=binomial, method = "REML")
# Fit a logistic model with smooths of all seven predictors
log_mod2 <- gam(purchase ~ s(n_acts) + s(bal_crdt_ratio) + s(avg_prem_balance) +
s(retail_crdt_ratio) + s(avg_fin_balance) + s(mortgage_age) +
s(cred_limit), data = csale, family = binomial, method = "REML")
# View the summary
summary(log_mod2)
##
## Family: binomial
## Link function: logit
##
## Formula:
## purchase ~ s(n_acts) + s(bal_crdt_ratio) + s(avg_prem_balance) +
## s(retail_crdt_ratio) + s(avg_fin_balance) + s(mortgage_age) +
## s(cred_limit)
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.64060 0.07557 -21.71 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(n_acts) 3.474 4.310 93.670 < 2e-16 ***
## s(bal_crdt_ratio) 4.308 5.257 18.386 0.00318 **
## s(avg_prem_balance) 2.275 2.816 7.800 0.04958 *
## s(retail_crdt_ratio) 1.001 1.001 1.422 0.23343
## s(avg_fin_balance) 1.850 2.202 2.506 0.27895
## s(mortgage_age) 4.669 5.710 9.656 0.13401
## s(cred_limit) 1.001 1.002 23.066 1.58e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.184 Deviance explained = 18.4%
## -REML = 781.37 Scale est. = 1 n = 1779
# Plot on the log-odds scale (the default link scale for binomial models)
plot(log_mod2, pages=1)
# Plot on the probability scale: trans = plogis applies the inverse-logit transform
plot(log_mod2, pages = 1, trans = plogis)
# Plot with the intercept: shift adds the model intercept before transforming
plot(log_mod2, pages = 1, trans = plogis, shift = coef(log_mod2)[1])
# Plot with intercept uncertainty included in the confidence bands
plot(log_mod2, pages = 1, trans = plogis, shift = coef(log_mod2)[1], seWithMean = TRUE)
# Build seven hypothetical customers; values are entered column-by-column (byrow=FALSE)
new_credit_data <- data.frame(matrix(data=c(1, 0, 0, 0, 0, 0, 0, 2, 19, 0, 0, 1, 6, 3, 0.3, 4.2, 36.095, 36.095, 25.7, 45.6, 10.8, 61, 967, 2494.414, 2494.414, 2494.414, 195, 2494.414, 11.491, 0, 11.491, 11.491, 11.491, 0, 11.491, 1767.197, 249, 1767.197, 1767.197, 1767.197, 0, 1767.197, 155, 65, 138.96, 138.96, 138.96, 13, 138.96, 0, 10000, 0, 0, 0, 13800, 0), ncol=8, nrow=7, byrow=FALSE))
names(new_credit_data) <- c('purchase', 'n_acts', 'bal_crdt_ratio', 'avg_prem_balance', 'retail_crdt_ratio', 'avg_fin_balance', 'mortgage_age', 'cred_limit')
new_credit_data
## purchase n_acts bal_crdt_ratio avg_prem_balance retail_crdt_ratio
## 1 1 2 0.300 61.000 11.491
## 2 0 19 4.200 967.000 0.000
## 3 0 0 36.095 2494.414 11.491
## 4 0 0 36.095 2494.414 11.491
## 5 0 1 25.700 2494.414 11.491
## 6 0 6 45.600 195.000 0.000
## 7 0 3 10.800 2494.414 11.491
## avg_fin_balance mortgage_age cred_limit
## 1 1767.197 155.00 0
## 2 249.000 65.00 10000
## 3 1767.197 138.96 0
## 4 1767.197 138.96 0
## 5 1767.197 138.96 0
## 6 0.000 13.00 13800
## 7 1767.197 138.96 0
# Calculate predictions and errors on the link (log-odds) scale
predictions <- predict(log_mod2, newdata = new_credit_data,
type = "link", se.fit = TRUE)
# Calculate high and low prediction intervals (fit +/- 2 standard errors)
high_pred <- predictions$fit + 2*predictions$se.fit
low_pred <- predictions$fit - 2*predictions$se.fit
# Convert intervals to probability scale
# (1 / (1/exp(x) + 1) is algebraically the inverse logit, i.e. plogis(x))
high_prob <- 1 / (1/exp(high_pred) + 1)
low_prob <- 1 / (1/exp(low_pred) + 1)
# Inspect
high_prob
## 1 2 3 4 5 6 7
## 0.29534339 0.80264652 0.07015439 0.07015439 0.12907886 0.37928666 0.27248599
low_prob
## 1 2 3 4 5 6 7
## 0.15023805 0.53570264 0.03758807 0.03758807 0.06804119 0.11279970 0.15491807
# Decompose the prediction for the first new observation into the
# contribution of each smooth term (type = "terms"); the model
# intercept is carried separately in the "constant" attribute.
first_obs <- new_credit_data[1, , drop = FALSE]
prediction_1 <- predict(log_mod2, newdata = first_obs, type = "terms")
# Inspect
prediction_1
## s(n_acts) s(bal_crdt_ratio) s(avg_prem_balance) s(retail_crdt_ratio)
## 1 -0.3626621 0.3352521 0.369506 -0.007531015
## s(avg_fin_balance) s(mortgage_age) s(cred_limit)
## 1 -0.04057248 -0.1774484 0.2229033
## attr(,"constant")
## (Intercept)
## -1.640601
Chapter 1 - Foundations of Tidy Machine Learning
Introduction:
Map family of functions:
Tidy models with broom:
Example code includes:
# Explore gapminder
data(gapminder, package = "gapminder")
head(gapminder)
# Nest each country's rows into a list-column named `data`,
# leaving one row per country.
gap_nested <- gapminder %>%
  group_by(country) %>%
  nest()
# Explore gap_nested
head(gap_nested)
# Create the unnested dataframe called gap_unnested.
# Name the list-column explicitly: bare unnest() with no columns is
# deprecated as of tidyr 1.0.
gap_unnested <- gap_nested %>%
  unnest(cols = c(data))
# Confirm that the nest()/unnest() round trip did not modify the data
identical(gapminder, gap_unnested)
# Pull Algeria's nested dataframe out of the list-column and
# summarise its population series.
algeria_row <- which(gap_nested$country == "Algeria")
algeria_df <- gap_nested$data[[algeria_row]]
# Minimum of the population vector
min(algeria_df$pop)
# Maximum of the population vector
max(algeria_df$pop)
# Mean of the population vector
mean(algeria_df$pop)
# Mean population per country, computed inside the nested data.
# map() yields a list-column, so the result needs unnesting.
pop_nested <- gap_nested %>%
  mutate(mean_pop = map(data, function(df) mean(df$pop)))
# Take a look at pop_nested
head(pop_nested)
# Flatten the list-column into an ordinary numeric column
pop_mean <- pop_nested %>%
  unnest(mean_pop)
# Take a look at pop_mean
head(pop_mean)
# map_dbl() returns a plain double vector directly, so the
# intermediate list-column and the unnest step are unnecessary.
pop_mean <- gap_nested %>%
  mutate(mean_pop = map_dbl(data, function(df) mean(df$pop)))
# Take a look at pop_mean
head(pop_mean)
# Fit one simple linear trend (life expectancy vs year) per country,
# stored alongside each country's data in a list-column.
gap_models <- gap_nested %>%
  mutate(model = map(data, function(df) lm(lifeExp ~ year, data = df)))
# Extract the model for Algeria
algeria_model <- gap_models$model[[which(gap_models$country == "Algeria")]]
# View the summary for the Algeria model
summary(algeria_model)
# Coefficient table as a tidy dataframe
broom::tidy(algeria_model)
# One-row dataframe of model-level fit statistics
broom::glance(algeria_model)
# Observation-level results: fitted values, residuals, etc.
algeria_fitted <- broom::augment(algeria_model)
# Overlay the fitted trend (red) on the observed life expectancy
algeria_fitted %>%
  ggplot(aes(x = year)) +
  geom_point(aes(y = lifeExp)) +
  geom_line(aes(y = .fitted), color = "red")
Chapter 2 - Multiple Models with broom
Exploring coefficients across models:
Evaluating fit of many models:
Visually inspect the fit of many models:
Improve the fit of your models:
Example code includes:
# Tidy every country's model into a coefficient table, then flatten.
model_coef_nested <- gap_models %>%
  mutate(coef = map(model, broom::tidy))
model_coef <- model_coef_nested %>%
  unnest(coef)
# Distribution of the per-country slope on year
model_coef %>%
  filter(term == "year") %>%
  ggplot(aes(x = estimate)) +
  geom_histogram()
# Collect model-level fit statistics for every country, then flatten.
model_perf_nested <- gap_models %>%
  mutate(fit = map(model, broom::glance))
model_perf <- model_perf_nested %>%
  unnest(fit)
# Look at the first six rows of model_perf
head(model_perf)
# Distribution of R-squared across the 77 country models
model_perf %>%
  ggplot(aes(x = r.squared)) +
  geom_histogram()
# Rank the country models by R-squared and keep the extremes:
# the four best fits...
best_fit <- model_perf %>%
  top_n(n = 4, wt = r.squared)
# ...and the four worst fits (negated weight flips the ranking)
worst_fit <- model_perf %>%
  top_n(n = 4, wt = -r.squared)
# Attach observation-level (augmented) results to a set of fitted
# models and flatten them for plotting.
augment_model_rows <- function(fits) {
  fits %>%
    mutate(augmented = map(model, broom::augment)) %>%
    unnest(augmented)
}
best_augmented <- augment_model_rows(best_fit)
worst_augmented <- augment_model_rows(worst_fit)
# Observed life expectancy (points) vs fitted trend (red line),
# one panel per country, for the four best fitting models
best_augmented %>%
  ggplot(aes(x = year)) +
  geom_point(aes(y = lifeExp)) +
  geom_line(aes(y = .fitted), color = "red") +
  facet_wrap(~country, scales = "free_y")
# Same comparison for the four worst fitting models
worst_augmented %>%
  ggplot(aes(x = year)) +
  geom_point(aes(y = lifeExp)) +
  geom_line(aes(y = .fitted), color = "red") +
  facet_wrap(~country, scales = "free_y")
# Refit each country with every available predictor, not just year.
gap_fullmodel <- gap_nested %>%
  mutate(model = map(data, function(df) {
    lm(lifeExp ~ year + pop + gdpPercap, data = df)
  }))
# Fit statistics for the richer models, flattened for inspection
fullmodel_perf <- gap_fullmodel %>%
  mutate(fit = map(model, broom::glance)) %>%
  unnest(fit)
# Adjusted R-squared for the countries whose simple models fit worst
fullmodel_perf %>%
  filter(country %in% worst_fit$country) %>%
  select(country, adj.r.squared)
Chapter 3 - Build, Tune, and Evaluate Regression Models
Training, test, and validation splits:
Measuring cross-validation performance:
Building and tuning a random-forest model:
Measuring the test performance:
Example code includes:
set.seed(42)
# Hold out 25% of gapminder for final testing
gap_split <- rsample::initial_split(gapminder, prop = 0.75)
# Training partition
training_data <- rsample::training(gap_split)
# Testing partition
testing_data <- rsample::testing(gap_split)
# Row/column counts of each partition
dim(training_data)
dim(testing_data)
set.seed(42)
# Five-fold cross-validation partitions of the training data
cv_split <- rsample::vfold_cv(training_data, v = 5)
cv_data <- cv_split %>%
  mutate(
    # Training rows for each fold
    train = map(splits, rsample::training),
    # Held-out validation rows for each fold
    validate = map(splits, rsample::testing)
  )
# Use head() to preview cv_data
head(cv_data)
# Fit a linear model on each fold's training rows
cv_models_lm <- cv_data %>%
  mutate(model = map(train, function(df) lm(lifeExp ~ ., data = df)))
cv_prep_lm <- cv_models_lm %>%
  mutate(
    # Observed life expectancy in each fold's validation set
    validate_actual = map(validate, function(df) df$lifeExp),
    # Predictions from the fold's own model on its validation set
    validate_predicted = map2(model, validate, predict)
  )
library(Metrics)
# Mean absolute error on each fold's validation set
cv_eval_lm <- cv_prep_lm %>%
  mutate(validate_mae = map2_dbl(validate_actual, validate_predicted, mae))
# Print the validate_mae column
cv_eval_lm$validate_mae
# Average validation MAE across folds
mean(cv_eval_lm$validate_mae)
library(ranger)
# Random forest per fold (100 trees, fixed seed for reproducibility)
cv_models_rf <- cv_data %>%
  mutate(model = map(train, function(df) {
    ranger(lifeExp ~ ., data = df, num.trees = 100, seed = 42)
  }))
# ranger's predict() wraps its output; pull out $predictions
cv_prep_rf <- cv_models_rf %>%
  mutate(validate_predicted = map2(model, validate, function(m, df) {
    predict(m, df)$predictions
  }))
# Mean absolute error on each fold's validation set
cv_eval_rf <- cv_prep_rf %>%
  mutate(
    validate_actual = map(validate, function(df) df$lifeExp),
    validate_mae = map2_dbl(validate_actual, validate_predicted, mae)
  )
# Print the validate_mae column
cv_eval_rf$validate_mae
# Average validation MAE across folds
mean(cv_eval_rf$validate_mae)
# Cross every fold with each candidate mtry value (2 through 5)
cv_tune <- cv_data %>%
  tidyr::crossing(mtry = 2:5)
# One random forest per fold-by-mtry combination
cv_model_tunerf <- cv_tune %>%
  mutate(model = map2(train, mtry, function(df, m) {
    ranger(lifeExp ~ ., data = df, mtry = m,
           num.trees = 100, seed = 42)
  }))
# Validation-set predictions for every combination
cv_prep_tunerf <- cv_model_tunerf %>%
  mutate(validate_predicted = map2(model, validate, function(mod, df) {
    predict(mod, df)$predictions
  }))
# Validation MAE for every combination
cv_eval_tunerf <- cv_prep_tunerf %>%
  mutate(
    validate_actual = map(validate, function(df) df$lifeExp),
    validate_mae = map2_dbl(validate_actual, validate_predicted, mae)
  )
# Mean validation MAE for each candidate mtry
cv_eval_tunerf %>%
  group_by(mtry) %>%
  summarise(mean_mae = mean(validate_mae))
# Refit on all training data with the best mtry found during tuning
best_model <- ranger(lifeExp ~ ., data = training_data,
                     mtry = 4, num.trees = 100, seed = 42)
# Observed life expectancy in the held-out test set
test_actual <- testing_data$lifeExp
# Model predictions for the held-out test set
test_predicted <- predict(best_model, testing_data)$predictions
# Final test-set MAE
mae(test_actual, test_predicted)
Chapter 4 - Build, Tune, and Evaluate Classification Models
Logistic Regression Models:
Evaluating Classification Models:
Random Forest for Classification:
Wrap Up:
Example code includes:
# Employee attrition data for the classification examples
attrition <- readRDS("./RInputFiles/attrition.rds")
str(attrition)
head(attrition)
set.seed(42)
# Hold out 25% of the data for final testing
data_split <- rsample::initial_split(data = attrition, prop = 0.75)
# Training partition
training_data <- rsample::training(data_split)
# Testing partition
testing_data <- rsample::testing(data_split)
set.seed(42)
# Five-fold cross-validation partitions of the training data
cv_split <- rsample::vfold_cv(training_data, v = 5)
cv_data <- cv_split %>%
  mutate(
    # Training rows for each fold
    train = map(splits, rsample::training),
    # Held-out validation rows for each fold
    validate = map(splits, rsample::testing)
  )
# Logistic regression on each fold's training rows
cv_models_lr <- cv_data %>%
  mutate(model = map(train, function(df) {
    glm(Attrition ~ ., data = df, family = "binomial")
  }))
# Work through a single fold by hand
model <- cv_models_lr$model[[1]]
validate <- cv_models_lr$validate[[1]]
# TRUE where the employee actually left
validate_actual <- validate$Attrition == "Yes"
# Predicted attrition probabilities for the validation rows
validate_prob <- predict(model, validate, type = "response")
# Classify with a 0.5 probability cutoff
validate_predicted <- validate_prob > 0.5
library(Metrics)
# Confusion table of actual vs predicted
table(validate_actual, validate_predicted)
# Overall accuracy
accuracy(validate_actual, validate_predicted)
# Precision: share of predicted leavers who actually left
precision(validate_actual, validate_predicted)
# Recall: share of actual leavers the model caught
recall(validate_actual, validate_predicted)
cv_prep_lr <- cv_models_lr %>%
  mutate(
    # TRUE where the employee actually left, per fold
    validate_actual = map(validate, function(df) df$Attrition == "Yes"),
    # Classify each fold's validation set with a 0.5 probability cutoff
    validate_predicted = map2(model, validate, function(m, df) {
      predict(m, df, type = "response") > 0.5
    })
  )
# Validation recall for each fold
cv_perf_recall <- cv_prep_lr %>%
  mutate(validate_recall = map2_dbl(validate_actual, validate_predicted,
                                    function(a, p) recall(actual = a, predicted = p)))
# Print the validate_recall column
cv_perf_recall$validate_recall
# Average validation recall across folds
mean(cv_perf_recall$validate_recall)
library(ranger)
# Cross every fold with each candidate mtry value
cv_tune <- cv_data %>%
  crossing(mtry = c(2, 4, 8, 16))
# One classification forest per fold-by-mtry combination
cv_models_rf <- cv_tune %>%
  mutate(model = map2(train, mtry, function(df, m) {
    ranger(Attrition ~ ., data = df, mtry = m,
           num.trees = 100, seed = 42)
  }))
cv_prep_rf <- cv_models_rf %>%
  mutate(
    # TRUE where the employee actually left, per fold
    validate_actual = map(validate, function(df) df$Attrition == "Yes"),
    # ranger returns class labels under $predictions; compare to "Yes"
    validate_predicted = map2(model, validate, function(mod, df) {
      predict(mod, df, type = "response")$predictions == "Yes"
    })
  )
# Validation recall for each fold-by-mtry combination
cv_perf_recall <- cv_prep_rf %>%
  mutate(recall = map2_dbl(validate_actual, validate_predicted,
                           function(a, p) recall(actual = a, predicted = p)))
# Mean recall for each mtry used
cv_perf_recall %>%
  group_by(mtry) %>%
  summarise(mean_recall = mean(recall))
# Final model: logistic regression fitted on all training data
best_model <- glm(Attrition ~ ., data = training_data,
                  family = "binomial")
# TRUE where the employee actually left, in the held-out test set
test_actual <- testing_data$Attrition == "Yes"
# Classify the test set with a 0.5 probability cutoff
test_predicted <- predict(best_model, newdata = testing_data,
                          type = "response") > 0.5
# Confusion table of actual vs predicted on the test set
table(test_actual, test_predicted)
# Headline test-set metrics
accuracy(test_actual, test_predicted)
precision(test_actual, test_predicted)
recall(test_actual, test_predicted)
Chapter 1 - Introduction, Networks, and Labeled Networks
Introduction:
Labeled Networks, Social Influence:
Challenges:
Example code includes:
library(igraph)
# load("./RInputFiles/StudentEdgelist.RData")
# Create edgeList
elFrom <- c(1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 8, 8, 10, 10, 11, 11, 11, 11, 11, 11, 12, 12, 12, 13, 13, 14, 14, 14, 15, 15, 15, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 19, 19, 20, 20, 20, 21, 21, 22, 23, 23, 23, 23, 24, 24, 24, 25, 25, 25, 25, 25, 26, 26, 27, 28, 28, 28, 29, 29, 29, 29, 32, 32, 32, 32, 32, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 37, 37, 583, 38, 38, 39, 39, 40, 40, 40, 40, 343, 42, 42, 42, 42, 43, 43, 43, 43, 43, 43, 43, 43, 44, 44, 44, 44, 45, 45, 45, 45, 45, 45, 47, 47, 47, 47, 47, 48, 48, 49, 49, 50, 50, 51, 51, 52, 52, 53, 53, 53, 54, 54, 54, 54, 54, 54, 54, 54, 54, 55, 55, 56, 56, 56, 56, 56, 684, 57, 57, 58, 58, 58, 58, 191, 59, 59, 59, 60, 60, 60, 60, 61, 61, 61, 61, 61, 61, 61, 62, 63, 63, 63, 64, 64, 64, 64, 64, 64, 99, 65, 66, 66, 67, 68, 68, 68, 68, 68, 68, 69, 69, 69, 69, 69, 71, 71, 71, 71, 72, 72, 72, 73, 73, 73, 73, 74, 75, 76, 76, 76, 76, 76, 76, 238, 77, 77, 77, 78, 78, 78, 78, 78, 78, 78, 179, 80, 81, 707, 82, 83, 83, 83, 84, 84, 84, 84, 85, 85, 609, 85, 86, 86, 689, 86, 86, 87, 87, 87, 87, 87, 286, 459, 633, 88, 88, 89, 89, 89, 89, 90, 90, 90, 90, 91, 91, 92, 583, 92, 93, 93, 93, 94, 94, 358, 532, 95, 96, 97, 97, 707, 97, 98, 98, 98, 98, 98, 98, 98, 99, 99, 99, 99, 99, 807, 976, 101, 101, 101, 102, 102, 102, 102, 102, 784, 103, 104, 104, 105, 105, 106, 106, 106, 106, 106, 107, 583, 294, 109, 109, 110, 110, 111, 111, 111, 112, 112, 278, 113, 114, 114, 114, 114, 114, 115, 115, 115, 406, 116, 116, 116, 116, 267, 388, 117, 117, 117, 117, 117, 118, 358, 118, 119, 119, 119, 119, 119, 343, 120, 120, 121, 121, 121, 138, 207, 122, 122, 122, 122, 122, 889, 123, 123, 123, 123, 123, 124, 125, 125, 125, 125, 125, 125, 126, 126, 793, 889, 127, 128, 128, 128, 128, 128, 129, 707, 129, 337, 130, 130, 131, 131, 131, 131, 132, 133, 133, 133, 133, 532, 135, 406, 135, 186, 136, 364, 139, 139, 736, 467, 141, 288, 317, 142, 142, 142, 976, 143, 143, 143, 144, 144, 144, 889, 988, 
145, 145, 146, 146, 147, 288, 148, 149, 149, 149, 661, 149, 149, 149, 172, 150, 150, 150, 245, 152, 153, 153, 154, 154, 155, 155, 155, 186, 467, 156, 157, 157, 332, 642, 190, 159, 159, 159, 159, 159, 160, 160, 161, 161, 290, 162, 162, 163, 163, 163, 531, 164, 164, 164, 165, 406, 165, 165, 165, 166, 166, 166, 166, 167, 167, 168, 168, 233, 413, 683, 931, 170, 839, 170, 171, 171, 171, 171, 171, 172, 172, 400, 506, 184, 174, 174, 174, 174, 174, 175, 176, 176, 177, 177, 177, 177, 177, 177, 177, 177, 177, 393, 178, 178, 179, 179, 179, 180, 981, 181, 181, 181, 182, 183, 183, 185, 185, 186, 270, 187, 410, 625, 189, 189, 189, 190, 190, 190, 191, 191, 191, 191, 302, 192, 193, 193, 523, 741, 195, 195, 195, 424, 196, 349, 197, 197, 246, 198, 198, 198, 198, 199, 531, 200, 201, 201, 201, 201, 201, 411, 204, 205, 205, 205, 206, 206, 206, 206, 207, 207, 207, 207, 354, 208, 209, 587, 717, 211, 211, 211, 630, 666, 211, 212, 212, 212, 545, 212, 212, 456, 889, 214, 216, 216, 216)
elFrom <- c(elFrom, 216, 216, 927, 883, 422, 946, 219, 406, 734, 874, 221, 221, 222, 222, 853, 222, 222, 222, 223, 223, 223, 224, 224, 224, 274, 406, 227, 227, 228, 228, 229, 230, 230, 230, 964, 231, 231, 231, 864, 597, 232, 233, 675, 235, 964, 237, 416, 482, 568, 236, 237, 238, 238, 238, 239, 671, 734, 975, 240, 684, 240, 650, 241, 424, 817, 243, 683, 243, 243, 307, 456, 245, 245, 246, 968, 259, 247, 247, 669, 247, 487, 248, 248, 249, 689, 250, 250, 250, 251, 251, 251, 251, 891, 375, 855, 252, 253, 253, 253, 690, 789, 255, 255, 284, 587, 730, 256, 545, 257, 257, 281, 336, 614, 258, 258, 259, 259, 975, 274, 260, 260, 260, 260, 698, 261, 817, 262, 262, 262, 262, 803, 816, 406, 264, 264, 853, 265, 488, 601, 266, 267, 269, 269, 269, 269, 269, 270, 290, 271, 271, 927, 594, 274, 274, 274, 274, 963, 276, 276, 276, 454, 278, 278, 279, 279, 279, 329, 384, 280, 281, 822, 281, 482, 282, 284, 284, 284, 284, 285, 285, 285, 286, 286, 286, 286, 287, 287, 288, 288, 288, 289, 290, 449, 291, 669, 291, 788, 659, 292, 293, 522, 296, 296, 803, 296, 297, 299, 300, 300, 300, 300, 643, 301, 302, 302, 302, 302, 303, 303, 303, 303, 531, 704, 304, 305, 789, 306, 306, 306, 306, 308, 336, 309, 309, 745, 312, 312, 313, 313, 313, 637, 649, 790, 802, 960, 343, 524, 316, 317, 317, 318, 335, 319, 319, 319, 689, 320, 964, 321, 321, 321, 321, 323, 323, 323, 324, 324, 868, 698, 327, 529, 329, 329, 329, 329, 960, 332, 746, 811, 822, 865, 335, 334, 334, 334, 424, 335, 335, 336, 336, 336, 633, 931, 401, 341, 407, 412, 842, 343, 343, 343, 532, 637, 344, 553, 597, 399, 758, 346, 347, 347, 348, 348, 348, 348, 349, 349, 349, 817, 351, 351, 807, 855, 353, 353, 976, 354, 354, 354, 356, 356, 371, 358, 358, 707, 859, 994, 690, 360, 360, 361, 361, 361, 361, 973, 375, 362, 784, 365, 365, 365, 732, 890, 972, 637, 368, 369, 369, 369, 370, 643, 370, 370, 371, 372, 404, 373, 997, 375, 375, 376, 568, 377, 377, 378, 402, 840, 380, 425, 465, 569, 382, 382, 803, 894, 384, 384, 608, 526, 388, 388, 390, 390, 390, 390, 390, 
390, 672, 392, 393, 660, 395, 395, 743, 397, 398, 399, 996, 460, 401, 401, 401, 402, 403, 422, 403, 413, 406, 406, 406, 406, 408, 408, 930, 409, 409, 409, 409, 410, 410, 410, 411, 411, 412, 413, 414, 415, 415, 855, 416, 416, 445, 417, 417, 417, 871, 420, 946, 420, 420, 422, 877, 423, 424, 424, 425, 426, 822, 430, 429, 429, 429, 732, 429, 429, 430, 433, 669, 433, 467, 433, 434, 435, 436, 436, 812, 437, 437, 853, 963, 438, 439, 446, 440, 608, 440, 914, 441, 692, 441, 862, 442, 442, 443, 446, 744, 446, 880, 449, 482, 556, 912, 957, 453, 453, 453, 994, 454, 454, 551, 561, 998, 457, 457, 788, 473, 458, 689, 459, 689, 460, 460, 679, 812, 884, 886, 717, 741, 466, 467, 467, 760, 834, 529, 738, 470, 470, 503, 471, 471, 472, 472, 473, 473, 999, 474, 474, 474, 643, 835, 476, 690, 859, 478, 479, 877, 480, 480, 480, 503, 774, 482, 483, 584, 733, 487, 487, 488, 488, 642, 488, 577, 491, 802, 492, 493, 494, 494, 498, 496, 496, 642, 726, 497, 914, 498, 499, 499, 499, 500, 500, 942, 581, 915, 501, 543, 701, 770, 503, 503, 748, 506, 506, 506, 506, 737, 982, 913, 510, 566, 511, 692, 767, 932, 737, 660, 742, 552, 518, 700, 854, 640, 583, 521, 521, 522, 522, 524, 683, 718, 525, 526, 527, 759, 529, 530, 530, 998, 531, 531, 531, 532, 532, 532, 532, 532, 577)
elFrom <- c(elFrom, 533, 533, 877, 534, 534, 534, 734, 535, 535, 561, 536, 538, 538, 538, 539, 614, 652, 541, 541, 541, 997, 543, 543, 543, 946, 545, 791, 549, 550, 963, 705, 809, 554, 554, 555, 912, 556, 556, 556, 556, 557, 958, 632, 665, 719, 560, 560, 561, 570, 628, 707, 777, 923, 976, 564, 564, 738, 564, 564, 565, 565, 566, 566, 567, 567, 567, 569, 570, 958, 679, 958, 908, 638, 736, 577, 653, 808, 607, 645, 774, 865, 636, 583, 583, 999, 585, 585, 587, 840, 609, 589, 589, 995, 592, 874, 625, 698, 594, 595, 596, 822, 732, 823, 607, 771, 724, 804, 600, 839, 601, 731, 800, 854, 645, 607, 607, 607, 607, 608, 609, 610, 610, 891, 943, 613, 615, 800, 839, 638, 619, 619, 883, 620, 620, 768, 622, 877, 623, 623, 623, 633, 892, 889, 787, 984, 748, 761, 929, 690, 930, 630, 630, 632, 633, 685, 634, 634, 635, 635, 635, 637, 968, 883, 638, 638, 816, 642, 669, 643, 643, 644, 646, 895, 740, 649, 650, 650, 961, 651, 651, 856, 730, 849, 680, 908, 960, 661, 662, 931, 955, 666, 666, 669, 669, 669, 670, 671, 671, 768, 672, 812, 818, 675, 677, 677, 677, 679, 679, 679, 889, 683, 684, 732, 685, 876, 770, 687, 788, 690, 692, 692, 692, 692, 692, 790, 916, 698, 718, 722, 743, 937, 702, 704, 707, 759, 917, 713, 713, 732, 715, 892, 717, 718, 874, 921, 942, 744, 722, 891, 968, 724, 724, 726, 726, 881, 732, 733, 734, 793, 892, 740, 970, 742, 743, 744, 744, 784, 746, 886, 807, 770, 996, 994, 856, 878, 813, 769, 771, 902, 956, 982, 780, 810, 874, 825, 873, 840, 924, 789, 988, 790, 811, 791, 875, 795, 795, 800, 802, 964, 805, 980, 808, 808, 808, 874, 874, 811, 922, 819, 868, 929, 881, 822, 822, 952, 865, 963, 981, 988, 872, 832, 839, 960, 851, 855, 855, 860, 918, 861, 875, 875, 984, 888, 890, 897, 901, 998, 905, 961, 907, 918, 988, 976, 982, 922, 923, 924, 947, 970, 974, 997, 999, 942, 952, 984, 992)
elTo <- c(250, 308, 413, 525, 803, 894, 332, 433, 474, 847, 963, 968, 147, 290, 337, 393, 474, 179, 193, 233, 737, 793, 838, 684, 718, 237, 404, 698, 724, 285, 641, 86, 285, 376, 689, 758, 889, 145, 410, 544, 583, 835, 96, 788, 924, 43, 91, 446, 181, 289, 378, 406, 547, 784, 189, 399, 482, 822, 262, 308, 817, 832, 260, 997, 81, 229, 839, 56, 840, 183, 186, 397, 676, 760, 344, 534, 980, 303, 343, 395, 925, 988, 483, 522, 132, 335, 506, 643, 304, 704, 871, 872, 466, 524, 567, 683, 997, 264, 279, 896, 105, 356, 460, 568, 726, 789, 865, 902, 951, 988, 138, 293, 38, 614, 633, 224, 550, 64, 224, 463, 521, 41, 347, 566, 746, 885, 99, 424, 442, 459, 571, 613, 689, 807, 84, 106, 257, 883, 191, 222, 265, 631, 681, 853, 207, 296, 546, 726, 866, 161, 665, 640, 816, 160, 669, 284, 313, 371, 973, 270, 407, 748, 230, 410, 445, 587, 644, 651, 936, 961, 964, 320, 804, 284, 476, 506, 755, 919, 57, 730, 754, 85, 259, 609, 975, 59, 278, 360, 413, 454, 589, 609, 889, 170, 184, 215, 365, 426, 707, 828, 548, 294, 479, 671, 473, 497, 642, 914, 942, 999, 65, 358, 491, 669, 326, 310, 531, 717, 852, 882, 960, 172, 331, 416, 552, 643, 453, 607, 732, 994, 245, 428, 943, 97, 255, 279, 570, 238, 412, 235, 271, 532, 722, 927, 964, 77, 286, 638, 736, 216, 351, 526, 745, 911, 927, 971, 79, 970, 839, 82, 814, 288, 556, 595, 456, 560, 563, 976, 98, 249, 85, 802, 149, 661, 86, 788, 953, 312, 630, 696, 740, 793, 88, 88, 88, 682, 864, 400, 424, 460, 947, 208, 354, 683, 744, 467, 946, 195, 92, 932, 402, 411, 828, 321, 670, 95, 95, 931, 766, 129, 458, 97, 968, 116, 161, 336, 422, 493, 802, 877, 281, 442, 651, 826, 907, 100, 100, 352, 364, 414, 190, 618, 839, 914, 924, 103, 887, 635, 961, 182, 772, 380, 488, 510, 662, 670, 520, 107, 109, 584, 597, 267, 388, 338, 862, 933, 691, 998, 113, 763, 475, 506, 790, 835, 898, 420, 494, 819, 116, 440, 535, 692, 886, 117, 117, 437, 661, 853, 876, 877, 219, 118, 798, 156, 672, 731, 766, 825, 120, 972, 981, 551, 842, 963, 122, 122, 349, 527, 529, 650, 770, 122, 150, 
354, 377, 811, 922, 617, 130, 190, 337, 523, 742, 880, 409, 539, 126, 127, 937, 152, 159, 448, 527, 591, 317, 129, 973, 130, 702, 831, 324, 472, 492, 659, 436, 168, 172, 515, 538, 134, 329, 135, 714, 136, 706, 137, 241, 912, 140, 141, 660, 142, 142, 470, 735, 773, 142, 282, 705, 930, 300, 625, 715, 144, 144, 319, 774, 610, 741, 375, 148, 909, 383, 585, 593, 149, 718, 874, 894, 150, 178, 545, 873, 152, 487, 768, 999, 868, 995, 269, 560, 860, 156, 156, 645, 425, 849, 158, 158, 159, 342, 407, 595, 620, 849, 514, 645, 553, 802, 162, 549, 981, 239, 540, 864, 164, 537, 795, 800, 391, 165, 630, 761, 891, 333, 427, 581, 982, 855, 907, 398, 515, 169, 169, 169, 169, 184, 170, 890, 176, 356, 613, 679, 884, 426, 796, 173, 173, 174, 211, 274, 403, 734, 812, 511, 302, 363, 204, 246, 251, 283, 465, 557, 664, 704, 891, 178, 884, 915, 242, 302, 637, 273, 180, 289, 587, 696, 569, 671, 831, 305, 789, 277, 187, 341, 188, 188, 222, 439, 590, 620, 867, 880, 278, 356, 968, 989, 192, 419, 464, 673, 194, 194, 372, 403, 863, 196, 810, 197, 601, 666, 198, 401, 716, 719, 771, 594, 200, 756, 381, 666, 688, 788, 867, 202, 979, 498, 525, 898, 253, 539, 818, 859, 236, 448, 790, 880, 208, 879, 906, 210, 210, 261, 384, 545, 211, 211, 817, 346, 443, 451, 212, 752, 947, 213, 213, 674, 300, 322, 543, 671)
elTo <- c(elTo, 881, 216, 217, 218, 218, 239, 220, 220, 220, 649, 957, 265, 269, 222, 918, 920, 989, 675, 692, 809, 394, 463, 854, 225, 225, 658, 960, 307, 975, 695, 791, 811, 905, 230, 449, 452, 556, 231, 232, 769, 886, 234, 700, 235, 236, 236, 236, 236, 929, 503, 769, 822, 905, 517, 239, 239, 239, 297, 240, 808, 241, 769, 242, 242, 374, 243, 744, 930, 244, 244, 251, 642, 922, 246, 247, 291, 444, 247, 690, 248, 663, 944, 632, 249, 515, 769, 986, 334, 335, 465, 801, 251, 252, 252, 870, 322, 402, 721, 254, 254, 627, 852, 256, 256, 256, 987, 257, 909, 996, 258, 258, 258, 956, 996, 392, 845, 259, 260, 567, 600, 769, 916, 261, 781, 261, 306, 319, 323, 776, 263, 263, 264, 420, 795, 265, 862, 266, 266, 958, 876, 602, 743, 862, 888, 911, 441, 271, 324, 608, 271, 272, 437, 564, 600, 824, 274, 699, 743, 974, 277, 593, 821, 724, 865, 897, 280, 280, 992, 774, 281, 991, 282, 705, 398, 476, 730, 919, 746, 758, 823, 342, 635, 800, 821, 370, 390, 420, 556, 861, 779, 324, 291, 648, 291, 688, 291, 292, 954, 953, 296, 533, 577, 296, 849, 357, 596, 668, 715, 732, 881, 301, 753, 369, 419, 518, 987, 500, 655, 672, 825, 304, 304, 919, 653, 305, 434, 519, 540, 612, 894, 309, 485, 691, 311, 316, 617, 557, 596, 621, 314, 314, 314, 314, 314, 315, 316, 856, 537, 714, 895, 319, 361, 391, 534, 320, 804, 320, 590, 670, 875, 985, 447, 733, 812, 363, 655, 325, 326, 836, 328, 492, 621, 673, 984, 330, 901, 333, 333, 333, 333, 334, 417, 588, 966, 335, 589, 954, 614, 682, 802, 337, 338, 340, 368, 342, 342, 342, 588, 716, 892, 344, 344, 980, 345, 345, 346, 346, 834, 558, 623, 489, 559, 632, 850, 359, 588, 770, 351, 849, 971, 352, 352, 603, 854, 353, 819, 843, 856, 789, 823, 357, 536, 561, 359, 359, 359, 360, 884, 942, 450, 558, 701, 918, 361, 362, 461, 364, 580, 598, 685, 365, 366, 366, 367, 713, 499, 571, 700, 390, 370, 753, 921, 994, 863, 373, 471, 373, 508, 942, 435, 377, 645, 917, 948, 379, 379, 680, 382, 382, 382, 579, 768, 383, 383, 738, 837, 385, 386, 501, 636, 435, 462, 499, 606, 794, 923, 
392, 845, 870, 394, 866, 987, 396, 909, 856, 758, 399, 400, 417, 722, 823, 721, 419, 403, 550, 405, 547, 886, 934, 999, 486, 870, 408, 470, 581, 600, 604, 502, 961, 964, 430, 441, 897, 803, 897, 628, 833, 415, 798, 908, 417, 588, 966, 984, 419, 861, 420, 983, 985, 479, 422, 777, 913, 954, 887, 796, 428, 429, 578, 652, 667, 429, 739, 808, 759, 431, 431, 452, 433, 920, 554, 923, 580, 673, 436, 445, 743, 437, 437, 480, 647, 440, 505, 440, 709, 440, 603, 441, 772, 441, 777, 878, 752, 668, 446, 955, 448, 577, 450, 450, 450, 452, 607, 678, 787, 453, 806, 980, 455, 455, 455, 468, 584, 457, 458, 530, 458, 538, 459, 902, 979, 461, 461, 462, 462, 463, 466, 939, 797, 813, 468, 468, 469, 469, 905, 949, 471, 516, 949, 701, 800, 521, 530, 473, 644, 759, 987, 475, 475, 495, 477, 477, 799, 676, 479, 507, 605, 634, 481, 481, 929, 610, 484, 484, 638, 682, 544, 571, 488, 918, 490, 510, 492, 950, 773, 634, 876, 495, 829, 955, 497, 497, 833, 497, 652, 606, 696, 767, 655, 825, 500, 501, 501, 938, 502, 502, 502, 558, 949, 504, 571, 724, 766, 790, 508, 508, 509, 615, 511, 654, 513, 513, 513, 516, 517, 517, 518, 598, 518, 518, 519, 520, 617, 622, 733, 952, 567, 524, 524, 528, 541, 820, 528, 699, 575, 903, 530, 697, 871, 882, 565, 575, 590, 682, 968, 533, 622, 753)
elTo <- c(elTo, 533, 903, 923, 980, 535, 926, 970, 536, 791, 600, 709, 934, 544, 540, 541, 703, 866, 944, 541, 701, 799, 824, 543, 850, 547, 786, 606, 551, 554, 554, 827, 939, 711, 555, 729, 855, 872, 966, 621, 558, 559, 559, 559, 582, 636, 998, 562, 562, 562, 562, 562, 563, 604, 651, 564, 786, 844, 782, 836, 654, 810, 612, 769, 916, 910, 777, 570, 573, 573, 575, 576, 576, 656, 578, 578, 579, 579, 579, 579, 582, 606, 968, 584, 851, 857, 636, 588, 589, 711, 967, 590, 677, 593, 594, 594, 825, 981, 621, 596, 598, 598, 599, 599, 600, 600, 934, 601, 875, 602, 603, 603, 604, 798, 849, 940, 993, 902, 967, 611, 967, 611, 611, 843, 686, 618, 618, 619, 764, 876, 619, 916, 926, 622, 778, 622, 703, 720, 819, 624, 624, 625, 626, 626, 627, 627, 627, 629, 629, 740, 789, 892, 779, 634, 765, 813, 728, 821, 873, 644, 637, 638, 895, 934, 640, 694, 643, 753, 861, 987, 708, 647, 648, 957, 722, 905, 650, 786, 844, 654, 656, 656, 657, 658, 658, 852, 786, 662, 664, 751, 817, 694, 711, 985, 786, 734, 831, 672, 825, 673, 674, 725, 775, 798, 945, 749, 884, 886, 679, 931, 754, 685, 814, 686, 687, 783, 688, 942, 695, 790, 809, 833, 910, 695, 697, 725, 699, 699, 699, 699, 928, 919, 777, 708, 709, 749, 935, 715, 935, 716, 852, 937, 719, 720, 721, 722, 905, 723, 723, 829, 865, 865, 966, 728, 881, 796, 755, 737, 740, 926, 740, 830, 974, 753, 803, 745, 791, 749, 754, 757, 758, 760, 762, 764, 765, 846, 873, 775, 775, 776, 977, 781, 781, 782, 785, 786, 788, 876, 789, 887, 791, 905, 792, 857, 858, 810, 869, 804, 990, 806, 875, 928, 955, 810, 811, 884, 811, 843, 819, 820, 821, 959, 991, 827, 829, 830, 831, 831, 832, 928, 965, 840, 857, 870, 872, 910, 860, 985, 941, 985, 887, 969, 977, 954, 904, 903, 949, 905, 933, 910, 910, 912, 921, 937, 1000, 940, 925, 926, 928, 933, 934, 940, 940, 966, 966)
# Assemble the friendship edge list as a two-column dataframe;
# both columns are numeric student IDs.
edgeList <- data.frame(
  from = elFrom,
  to = elTo,
  stringsAsFactors = FALSE
)
# Inspect edgeList
str(edgeList)
## 'data.frame': 1663 obs. of 2 variables:
## $ from: num 1 1 1 1 1 1 2 2 2 2 ...
## $ to : num 250 308 413 525 803 894 332 433 474 847 ...
head(edgeList)
## from to
## 1 1 250
## 2 1 308
## 3 1 413
## 4 1 525
## 5 1 803
## 6 1 894
# Build an undirected igraph network from the edge list; vertex names
# are taken from the student IDs in the `from`/`to` columns.
network <- graph_from_data_frame(edgeList, directed = FALSE)
# View your igraph object
network
## IGRAPH 0ee1f16 UN-- 956 1663 --
## + attr: name (v/c)
## + edges from 0ee1f16 (vertex names):
## [1] 1 --250 1 --308 1 --413 1 --525 1 --803 1 --894 2 --332 2 --433 2 --474
## [10] 2 --847 2 --963 2 --968 3 --147 3 --290 3 --337 3 --393 3 --474 4 --179
## [19] 4 --193 4 --233 5 --737 5 --793 5 --838 6 --684 6 --718 7 --237 7 --404
## [28] 8 --698 8 --724 10--285 10--641 11--86 11--285 11--376 11--689 11--758
## [37] 11--889 12--145 12--410 12--544 13--583 13--835 14--96 14--788 14--924
## [46] 15--43 15--91 15--446 16--181 16--289 16--378 16--406 16--547 16--784
## [55] 17--189 17--399 17--482 17--822 18--262 18--308 18--817 18--832 19--260
## [64] 19--997 20--81 20--229 20--839 21--56 21--840 22--183 23--186 23--397
## + ... omitted several edges
# load("./RInputFiles/StudentCustomers.RData")
# Customer ids 1..956.
# The original enumerated all 956 values literally (split across three
# statements to dodge a line-length limit); seq() with `by = 1` produces
# the identical double-typed vector (str() shows "num", not "int"),
# so downstream code and the recorded output are unchanged.
custID <- seq(1, 956, by = 1)
# Binary churn flag for each customer in custID, in the same order
# (1 = churned, 0 = retained; 30 of the 956 customers churn).
custChurn <- c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
# Continuation: churn flags for customers 652..956.
custChurn <- c(custChurn, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
# Assemble the customer table used throughout this chapter:
# one row per customer, with id and the binary churn indicator.
customers <- data.frame(id=custID, churn=custChurn)
# Inspect the customers dataframe
str(customers)
## 'data.frame': 956 obs. of 2 variables:
## $ id : num 1 2 3 4 5 6 7 8 9 10 ...
## $ churn: num 0 0 0 0 0 0 0 0 0 0 ...
head(customers)
## id churn
## 1 1 0
## 2 2 0
## 3 3 0
## 4 4 0
## 5 5 0
## 6 6 0
# Count the number of churners and non-churners
table(customers$churn)
##
## 0 1
## 926 30
# Look up each vertex of `network` in the customer table.
# NOTE(review): match() is applied to the vertex sequence V(network)
# itself (coerced to vertex indices), not to vertex names -- this
# assumes vertex order lines up with customer ids; confirm upstream.
matchID <- match(V(network), customers$id)
churnID <- customers$churn[matchID]
table(churnID)
## churnID
## 0 1
## 926 30
# Defensive default: vertices with no matching customer become
# non-churners (the table above shows there were no NAs in this data).
churnID[is.na(churnID)] <- 0
table(churnID)
## churnID
## 0 1
## 926 30
# Add a node attribute called churn
V(network)$churn <- churnID
# useVerts <- c('1', '10', '100', '1000', '101', '102', '103', '104', '105', '106', '107', '109', '11', '110', '111', '112', '113', '114', '115', '116', '117', '118', '119', '12', '120', '121', '122', '123', '124', '125', '126', '127', '128', '129', '13', '130', '131', '132', '133', '134', '135', '136', '137', '138', '139', '14', '140', '141', '142', '143', '144', '145', '146', '147', '148', '149', '15', '150', '152', '153', '154', '155', '156', '157', '158', '159', '16', '160', '161', '162', '163', '164', '165', '166', '167', '168', '169', '17', '170', '171', '172', '173', '174', '175', '176', '177', '178', '179', '18', '180', '181', '182', '183', '184', '185', '186', '187', '188', '189', '19', '190', '191', '192', '193', '194', '195', '196', '197', '198', '199', '2', '20', '200', '201', '202', '204', '205', '206', '207', '208', '209', '21', '210', '211', '212', '213', '214', '215', '216', '217', '218', '219', '22', '220', '221', '222', '223', '224', '225', '227', '228', '229', '23', '230', '231', '232', '233', '234', '235', '236', '237', '238', '239', '24', '240', '241', '242', '243', '244', '245', '246', '247', '248', '249', '25', '250', '251', '252', '253', '254', '255', '256', '257', '258', '259', '26', '260', '261', '262', '263', '264', '265', '266', '267', '269', '27', '270', '271', '272', '273', '274', '276', '277', '278', '279', '28', '280', '281', '282', '283', '284', '285', '286', '287', '288', '289', '29', '290', '291', '292', '293', '294', '296', '297', '299', '3', '300', '301', '302', '303', '304', '305', '306', '307', '308', '309', '310', '311', '312', '313', '314', '315', '316', '317', '318', '319', '32', '320', '321', '322', '323', '324', '325', '326', '327', '328', '329', '330', '331', '332', '333', '334', '335', '336', '337', '338', '34', '340', '341', '342', '343', '344', '345', '346', '347', '348', '349', '35', '351', '352', '353', '354', '356', '357', '358', '359', '360', '361', '362', '363', '364', '365', '366', '367', '368', '369', '37', 
'370', '371', '372', '373', '374', '375', '376', '377', '378', '379', '38', '380', '381', '382', '383', '384', '385', '386', '388', '39', '390', '391', '392', '393', '394', '395', '396', '397', '398', '399', '4', '40', '400', '401', '402', '403', '404', '405', '406', '407', '408', '409', '41', '410', '411', '412', '413', '414', '415', '416', '417', '419', '42', '420', '422', '423', '424', '425', '426', '427', '428', '429', '43', '430', '431', '433', '434', '435', '436', '437', '438', '439', '44', '440', '441', '442', '443', '444', '445', '446', '447', '448', '449', '45', '450', '451', '452', '453', '454', '455', '456', '457', '458', '459', '460', '461', '462', '463', '464', '465', '466', '467', '468', '469', '47', '470', '471', '472', '473', '474', '475', '476', '477', '478', '479', '48', '480', '481', '482', '483', '484', '485', '486', '487', '488', '489', '49', '490', '491', '492', '493', '494', '495', '496', '497', '498', '499')
# useVerts <- c(useVerts, '5', '50', '500', '501', '502', '503', '504', '505', '506', '507', '508', '509', '51', '510', '511', '513', '514', '515', '516', '517', '518', '519', '52', '520', '521', '522', '523', '524', '525', '526', '527', '528', '529', '53', '530', '531', '532', '533', '534', '535', '536', '537', '538', '539', '54', '540', '541', '543', '544', '545', '546', '547', '548', '549', '55', '550', '551', '552', '553', '554', '555', '556', '557', '558', '559', '56', '560', '561', '562', '563', '564', '565', '566', '567', '568', '569', '57', '570', '571', '573', '575', '576', '577', '578', '579', '58', '580', '581', '582', '583', '584', '585', '587', '588', '589', '59', '590', '591', '592', '593', '594', '595', '596', '597', '598', '599', '6', '60', '600', '601', '602', '603', '604', '605', '606', '607', '608', '609', '61', '610', '611', '612', '613', '614', '615', '617', '618', '619', '62', '620', '621', '622', '623', '624', '625', '626', '627', '628', '629', '63', '630', '631', '632', '633', '634', '635', '636', '637', '638', '64', '640', '641', '642', '643', '644', '645', '646', '647', '648', '649', '65', '650', '651', '652', '653', '654', '655', '656', '657', '658', '659', '66', '660', '661', '662', '663', '664', '665', '666', '667', '668', '669', '67', '670', '671', '672', '673', '674', '675', '676', '677', '678', '679', '68', '680', '681', '682', '683', '684', '685', '686', '687', '688', '689', '69', '690', '691', '692', '694', '695', '696', '697', '698', '699', '7', '700', '701', '702', '703', '704', '705', '706', '707', '708', '709', '71', '711', '713', '714', '715', '716', '717', '718', '719', '72', '720', '721', '722', '723', '724', '725', '726', '728', '729', '73', '730', '731', '732', '733', '734', '735', '736', '737', '738', '739', '74', '740', '741', '742', '743', '744', '745', '746', '748', '749', '75', '751', '752', '753', '754', '755', '756', '757', '758', '759', '76', '760', '761', '762', '763', '764', '765', '766', '767', '768', '769', 
'77', '770', '771', '772', '773', '774', '775', '776', '777', '778', '779', '78', '780', '781', '782', '783', '784', '785', '786', '787', '788', '789', '79', '790', '791', '792', '793', '794', '795', '796', '797', '798', '799', '8', '80', '800', '801', '802', '803', '804', '805', '806', '807', '808', '809', '81', '810', '811', '812', '813', '814', '816', '817', '818', '819', '82', '820', '821', '822', '823', '824', '825', '826', '827', '828', '829', '83', '830', '831', '832', '833', '834', '835', '836', '837', '838', '839', '84', '840', '842', '843', '844', '845', '846', '847', '849', '85', '850', '851', '852', '853', '854', '855', '856', '857', '858', '859', '86', '860', '861', '862', '863', '864', '865', '866', '867', '868', '869', '87', '870', '871', '872', '873', '874', '875', '876', '877', '878', '879', '88', '880', '881', '882', '883', '884', '885', '886', '887', '888', '889', '89', '890', '891', '892', '894', '895', '896', '897', '898', '90', '901', '902', '903', '904', '905', '906', '907', '908', '909', '91', '910', '911', '912', '913', '914', '915', '916', '917', '918', '919', '92', '920', '921', '922', '923', '924', '925', '926', '927', '928', '929', '93', '930', '931', '932', '933', '934', '935', '936', '937', '938', '939', '94', '940', '941', '942', '943', '944', '945', '946', '947', '948', '949', '95', '950', '951', '952', '953', '954', '955', '956', '957', '958', '959', '96', '960', '961', '963', '964', '965', '966', '967', '968', '969', '97', '970', '971', '972', '973', '974', '975', '976', '977', '979', '98', '980', '981', '982', '983', '984', '985', '986', '987', '988', '989', '99', '990', '991', '992', '993', '994', '995', '996', '997', '998', '999')
# useVertNums <- match(useVerts, V(network))
# useNetwork <- induced_subgraph(network, useVertNums)
# Work on the full network (the subgraph filtering above is disabled).
useNetwork <- network
useNetwork
## IGRAPH 0ee1f16 UN-- 956 1663 --
## + attr: name (v/c), churn (v/n)
## + edges from 0ee1f16 (vertex names):
## [1] 1 --250 1 --308 1 --413 1 --525 1 --803 1 --894 2 --332 2 --433 2 --474
## [10] 2 --847 2 --963 2 --968 3 --147 3 --290 3 --337 3 --393 3 --474 4 --179
## [19] 4 --193 4 --233 5 --737 5 --793 5 --838 6 --684 6 --718 7 --237 7 --404
## [28] 8 --698 8 --724 10--285 10--641 11--86 11--285 11--376 11--689 11--758
## [37] 11--889 12--145 12--410 12--544 13--583 13--835 14--96 14--788 14--924
## [46] 15--43 15--91 15--446 16--181 16--289 16--378 16--406 16--547 16--784
## [55] 17--189 17--399 17--482 17--822 18--262 18--308 18--817 18--832 19--260
## [64] 19--997 20--81 20--229 20--839 21--56 21--840 22--183 23--186 23--397
## + ... omitted several edges
# Visualize the network (pretty messy)
plot(useNetwork, vertex.label = NA, edge.label = NA, edge.color = 'black', vertex.size = 2)
# Add a node attribute called color
V(useNetwork)$color <- V(useNetwork)$churn
# Change the color of churners to red and non-churners to white
# (churn is numeric; gsub coerces it to character "0"/"1" before replacing)
V(useNetwork)$color <- gsub("1", "red", V(useNetwork)$color)
V(useNetwork)$color <- gsub("0", "white", V(useNetwork)$color)
# Plot the network (pretty messy)
plot(useNetwork, vertex.label = NA, edge.label = NA, edge.color = 'black', vertex.size = 2)
# Create a subgraph with only churners
churnerNetwork <- induced_subgraph(useNetwork, v = V(useNetwork)[which(V(useNetwork)$churn == 1)])
# Plot the churner network
plot(churnerNetwork, vertex.label = NA, vertex.size = 2)
# Count a vertex's neighbors by churn status.
#
# @param v A vertex (or vertex id) in `g`.
# @param g The igraph object to inspect. Defaults to the global
#   `useNetwork`, so existing callers are unaffected; passing `g`
#   explicitly makes the helper reusable on other graphs.
# @return Numeric vector of length 2:
#   c(number of non-churn neighbors, number of churn neighbors),
#   based on the `churn` vertex attribute.
ctNeighbors <- function(v, g = useNetwork) {
  tmp <- V(g)[neighbors(g, v, mode = "all")]$churn
  c(sum(tmp == 0), sum(tmp == 1))
}
# Count churn/non-churn neighbors for every vertex at once.
# vapply (instead of sapply) guarantees a 2 x n numeric matrix --
# sapply's return type silently depends on its input.
mtxNeighbors <- vapply(V(useNetwork), FUN=ctNeighbors, FUN.VALUE=numeric(2))
# Row 1 = non-churn neighbor counts, row 2 = churn neighbor counts.
NonChurnNeighbors <- mtxNeighbors[1, ]
ChurnNeighbors <- mtxNeighbors[2, ]
# Compute the churn probabilities (relational-neighbor classifier:
# fraction of a vertex's neighbors that churned)
churnProb <- ChurnNeighbors / (ChurnNeighbors + NonChurnNeighbors)
# Find who is most likely to churn
mostLikelyChurners <- which(churnProb == max(churnProb))
# Extract the IDs of the most likely churners
# (indexing `customers` by vertex position assumes vertex order matches
# customer row order -- TODO confirm)
customers$id[mostLikelyChurners]
## [1] 21 729 764 922
# Find churn probability of the 44th customer
churnProb[44]
## 49
## 0
# Update the churn probabilities and the non-churn probabilities
# (one propagation step: each vertex takes the mean of its neighbors' scores)
AdjacencyMatrix <- as_adjacency_matrix(useNetwork)
# Per-vertex neighbor counts (column sums of the 2 x n neighbor-count matrix)
nNeighbors <- colSums(mtxNeighbors)
churnProb_updated <- as.vector((AdjacencyMatrix %*% churnProb) / nNeighbors)
# Find updated churn probability of the 44th customer
churnProb_updated[44]
## [1] 0.3333333
# Compute the AUC
pROC::auc(churnID, as.vector(churnProb))
## Registered S3 methods overwritten by 'pROC':
## method from
## print.roc huge
## plot.roc huge
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Area under the curve: 0.5091
# Write a for loop to update the probabilities
# NOTE(review): known churners are not reset to probability 1 between
# iterations; this matches the recorded output, but confirm it is intended.
for(i in 1:10){
churnProb <- as.vector((AdjacencyMatrix %*% churnProb) / nNeighbors)
}
# Compute the AUC again
pROC::auc(churnID, as.vector(churnProb))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Area under the curve: 0.6753
Chapter 2 - Homophily
Homophily:
Dyadicity:
Heterophilicity:
Summary:
Example code includes:
# Add the column edgeList$FromLabel: churn flag of the source node
# (column 2 of `customers` is `churn`)
edgeList$FromLabel <- customers[match(edgeList$from, customers$id), 2]
edgeList$FromLabel[is.na(edgeList$FromLabel)] <- 0
# Add the column edgeList$ToLabel: churn flag of the target node
edgeList$ToLabel <- customers[match(edgeList$to, customers$id), 2]
edgeList$ToLabel[is.na(edgeList$ToLabel)] <- 0
# Add the column edgeList$edgeType
# (0 = both endpoints non-churn, 1 = mixed, 2 = both endpoints churn)
edgeList$edgeType <- edgeList$FromLabel + edgeList$ToLabel
# Count the number of each type of edge
table(edgeList$edgeType)
##
## 0 1 2
## 1544 117 2
# Count churn edges
ChurnEdges <- sum(edgeList$edgeType == 2)
# Count non-churn edges
NonChurnEdges <- sum(edgeList$edgeType == 0)
# Count mixed edges
MixedEdges <- sum(edgeList$edgeType == 1)
# Count all edges
edges <- ChurnEdges + NonChurnEdges + MixedEdges
# Print the number of edges
edges
## [1] 1663
# Count the number of churn nodes
ChurnNodes <- sum(customers$churn == 1)
# Count the number of non-churn nodes
NonChurnNodes <- sum(customers$churn == 0)
# Count the total number of nodes
nodes <- ChurnNodes + NonChurnNodes
# Compute the network connectance (observed edges / possible edges)
connectance <- 2 * edges / nodes / (nodes - 1)
# Print the value
connectance
## [1] 0.003643015
# Compute the expected churn dyadicity
# (churn-churn edges expected under random mixing)
ExpectedDyadChurn <- ChurnNodes * (ChurnNodes - 1) * connectance / 2
# Compute the churn dyadicity (ratio of observed to expected;
# > 1 means churners are connected to each other more than chance)
DyadChurn <- ChurnEdges / ExpectedDyadChurn
# Inspect the value
DyadChurn
## [1] 1.262059
# Compute the expected heterophilicity (expected mixed-edge count)
ExpectedHet <- NonChurnNodes * ChurnNodes * connectance
# Compute the heterophilicity (ratio of observed to expected mixed edges)
Het <- MixedEdges / ExpectedHet
# Inspect the heterophilicity
Het
## [1] 1.156093
Chapter 3 - Network Featurization
Basic Network Features:
Link-Based Features:
Page Rank:
Example code includes:
# Extract network degree (normalized by n - 1)
V(network)$degree <- degree(network, normalized=TRUE)
# Extract 2nd-order network degree (size of the radius-2 neighborhood)
degree2 <- neighborhood.size(network, 2)
# Normalize 2nd-order network degree
V(network)$degree2 <- degree2 / (length(V(network)) - 1)
# Extract number of triangles
V(network)$triangles <- count_triangles(network)
# Extract the betweenness
V(network)$betweenness <- betweenness(network, normalized=TRUE)
# Extract the closeness
V(network)$closeness <- closeness(network, normalized=TRUE)
## Warning in closeness(network, normalized = TRUE): At centrality.c:
## 2784 :closeness centrality is not well-defined for disconnected graphs
# Extract the eigenvector centrality
V(network)$eigenCentrality <- eigen_centrality(network, scale = TRUE)$vector
# Extract the local transitivity (clustering coefficient; isolates get 0)
V(network)$transitivity <- transitivity(network, type="local", isolates='zero')
# Compute the network's transitivity
transitivity(network)
## [1] 0.1002653
# Extract the adjacency matrix
AdjacencyMatrix <- as_adjacency_matrix(network)
# Compute the second order matrix (entry i,j = number of 2-step paths)
SecondOrderMatrix <- AdjacencyMatrix %*% AdjacencyMatrix
# Adjust the second order matrix (binarize, then zero the diagonal)
SecondOrderMatrix_adj <- ((SecondOrderMatrix) > 0) + 0
diag(SecondOrderMatrix_adj) <- 0
# Inspect the second order matrix
SecondOrderMatrix_adj[1:10, 1:10]
## 10 x 10 sparse Matrix of class "dgCMatrix"
## [[ suppressing 10 column names '1', '2', '3' ... ]]
##
## 1 0 . . . . . . . . .
## 2 . 0 1 . . . . . . .
## 3 . 1 0 . . . . . . .
## 4 . . . 0 . . . . . .
## 5 . . . . 0 . . . . .
## 6 . . . . . 0 . . . .
## 7 . . . . . . 0 . . .
## 8 . . . . . . . 0 . .
## 10 . . . . . . . . 0 1
## 11 . . . . . . . . 1 0
# Compute the number of churn neighbors
V(network)$ChurnNeighbors <- as.vector(AdjacencyMatrix %*% V(network)$churn)
# Compute the number of non-churn neighbors
V(network)$NonChurnNeighbors <- as.vector(AdjacencyMatrix %*% (1 - V(network)$churn))
# Compute the relational neighbor probability
V(network)$RelationalNeighbor <- as.vector(V(network)$ChurnNeighbors /
(V(network)$ChurnNeighbors + V(network)$NonChurnNeighbors))
# Compute the number of churners in the second order neighborhood
V(network)$ChurnNeighbors2 <- as.vector(SecondOrderMatrix %*% V(network)$churn)
# Compute the number of non-churners in the second order neighborhood
V(network)$NonChurnNeighbors2 <- as.vector(SecondOrderMatrix %*% (1 - V(network)$churn))
# Compute the relational neighbor probability in the second order neighborhood
V(network)$RelationalNeighbor2 <- as.vector(V(network)$ChurnNeighbors2 /
(V(network)$ChurnNeighbors2 + V(network)$NonChurnNeighbors2))
# Raw (unnormalized) degree, used as the divisor below.
# NOTE(review): this local `degree` shadows igraph::degree for the rest
# of the script -- consider renaming (e.g. rawDegree).
degree <- degree(network)
# Extract the average degree of neighboring nodes
V(network)$averageDegree <-
as.vector(AdjacencyMatrix %*% V(network)$degree) / degree
# Extract the average number of triangles of neighboring nodes
V(network)$averageTriangles <-
as.vector(AdjacencyMatrix %*% V(network)$triangles) / degree
# Extract the average transitivity of neighboring nodes
V(network)$averageTransitivity <-
as.vector(AdjacencyMatrix %*% V(network)$transitivity) / degree
# Extract the average betweenness of neighboring nodes
V(network)$averageBetweenness <-
as.vector(AdjacencyMatrix %*% V(network)$betweenness) / degree
# Compute one iteration of PageRank
# iter1 <- page.rank(network, algo = 'power', options = list(niter = 1))$vector
# Compute two iterations of PageRank
# iter2 <- page.rank(network, algo = 'power', options = list(niter = 2))$vector
# Inspect the change between one and two iterations
# sum(abs(iter1 - iter2))
# Inspect the change between nine and ten iterations
# sum(abs(iter9 - iter10))
# Create an empty vector
# value <- c()
# Write a loop to compute PageRank
# for(i in 1:15){
# value <- cbind(value, page.rank(network, algo = 'power',options = list(niter = i))$vector)
# }
# Compute the differences
# difference <- colSums(abs(value[,1:14] - value[,2:15]))
# Plot the differences
# plot(1:14, difference)
# boxplots <- function(damping=0.85, personalized=FALSE){
# if(personalized){
# V(network)$pp<-page.rank(network,damping=damping,personalized = V(network)$Churn)$vector
# }
# else{
# V(network)$pp<-page.rank(network,damping=damping)$vector
# }
# boxplot(V(network)$pp~V(network)$Churn)#
# }
# Look at the distribution of standard PageRank scores
# boxplots(damping = 0.85)
# Inspect the distribution of personalized PageRank scores
# boxplots(damping = 0.85, personalized = TRUE)
# Look at the standard PageRank with damping factor 0.2
# boxplots(damping = 0.2)
# Inspect the personalized PageRank scores with a damping factor 0.99
# boxplots(damping=0.99, personalized = TRUE)
# Compute the default PageRank score
# V(network)$pr_0.85 <- page.rank(network)$vector
# Compute the PageRank score with damping 0.2
# V(network)$pr_0.20 <- page.rank(network, damping=0.2)$vector
# Compute the personalized PageRank score
# V(network)$perspr_0.85 <- page.rank(network, damping=0.85, personalized = V(network)$Churn)$vector
# Compute the personalized PageRank score with damping 0.99
# V(network)$perspr_0.99 <- page.rank(network, damping=0.99, personalized = V(network)$Churn)$vector
Chapter 4 - Putting It All Together
Extract Dataset:
Building Predictive Models:
Evaluating Model Performance:
Wrap Up:
Example code includes:
# Extract the dataset: one row per vertex carrying every feature
# computed above as a column
dataset_full <- as_data_frame(network, what = "vertices")
# Target variable: does the customer churn in the *next* period?
dataset_full$Future <- 0
# Vertex names of the customers who churn in the future period
dsF1 <- c(404, 550, 41, 613, 48, 230, 294, 852, 93, 520, 617, 523, 714, 282, 705, 153, 995, 511, 204, 273, 194, 756, 979, 879, 843, 713, 837, 636, 469, 478, 938, 654, 751, 775)
dataset_full[match(dsF1, dataset_full$name), "Future"] <- 1
# Inspect the dataset
head(dataset_full)
## name churn degree degree2 triangles betweenness closeness
## 1 1 0 0.006282723 0.021989529 2 0.008143888 0.08535931
## 2 2 0 0.006282723 0.030366492 0 0.019841794 0.08965452
## 3 3 0 0.005235602 0.019895288 0 0.008281176 0.08587357
## 4 4 0 0.003141361 0.012565445 0 0.003340597 0.08377928
## 5 5 0 0.003141361 0.008376963 1 0.002081070 0.07731541
## 6 6 0 0.002094241 0.010471204 0 0.001766448 0.08158209
## eigenCentrality transitivity ChurnNeighbors NonChurnNeighbors
## 1 0.16674956 0.1333333 0 6
## 2 0.25675402 0.0000000 0 6
## 3 0.10174163 0.0000000 0 5
## 4 0.06977430 0.0000000 0 3
## 5 0.01838209 0.3333333 0 3
## 6 0.06214509 0.0000000 0 2
## RelationalNeighbor ChurnNeighbors2 NonChurnNeighbors2 RelationalNeighbor2
## 1 0 3 22 0.1200000
## 2 0 0 28 0.0000000
## 3 0 2 16 0.1111111
## 4 0 0 11 0.0000000
## 5 0 0 9 0.0000000
## 6 0 0 9 0.0000000
## averageDegree averageTriangles averageTransitivity averageBetweenness Future
## 1 0.004363002 0.8333333 0.13888889 0.005713676 0
## 2 0.004886562 0.5000000 0.03888889 0.013126033 0
## 3 0.003769634 0.6000000 0.08666667 0.005525569 0
## 4 0.003839442 0.0000000 0.00000000 0.005168532 0
## 5 0.003141361 0.6666667 0.11111111 0.003089281 0
## 6 0.004712042 1.0000000 0.13333333 0.005718188 0
# Remove customers who already churned
dataset_filtered <- dataset_full[-which(dataset_full$churn == 1), ]
# Remove useless columns (1 = name, 2 = churn)
dataset <- dataset_filtered[, -c(1, 2)]
# Inspect the feature
summary(dataset$RelationalNeighbor2)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.02509 0.04000 0.33333
# Find the indices of the missing values
toReplace <- which(is.na(dataset$RelationalNeighbor2))
# Replace the missing values with 0
dataset$RelationalNeighbor2[toReplace] <- 0
# Inspect the feature again
summary(dataset$RelationalNeighbor2)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.02509 0.04000 0.33333
# Generate the correlation matrix
M <- cor(dataset[,])
# Plot the correlations
corrplot::corrplot(M, method = "circle")
# Print the column names
colnames(dataset)
## [1] "degree" "degree2" "triangles"
## [4] "betweenness" "closeness" "eigenCentrality"
## [7] "transitivity" "ChurnNeighbors" "NonChurnNeighbors"
## [10] "RelationalNeighbor" "ChurnNeighbors2" "NonChurnNeighbors2"
## [13] "RelationalNeighbor2" "averageDegree" "averageTriangles"
## [16] "averageTransitivity" "averageBetweenness" "Future"
# Create toRemove
# toRemove <- c(10, 13, 19, 22)
# Remove the columns
# dataset <- dataset[, -toRemove]
# Set the seed
set.seed(7)
# Create the index vector (2/3 train, 1/3 test)
index_train <- sample(1:nrow(dataset), round((2/3) * nrow(dataset), 0), replace=FALSE)
# Make the training set
training_set <- dataset[index_train,]
# Make the test set
test_set <- dataset[-index_train,]
# Make firstModel: logistic regression on structural features only
firstModel <- glm(Future ~ degree + degree2 + triangles + betweenness + closeness + transitivity,
family = "binomial", data = training_set
)
# Build the model: logistic regression on neighborhood/churn features
secondModel <- glm(Future ~ ChurnNeighbors + RelationalNeighbor + ChurnNeighbors2 + RelationalNeighbor2 + averageDegree + averageTriangles + averageTransitivity + averageBetweenness,
family = "binomial", data = training_set
)
# Build the model: logistic regression on all features.
# NOTE(review): the correlated-column removal above is commented out, so
# this fit is rank deficient (see the predict() warning below).
thirdModel <- glm(Future ~ ., data=training_set, family="binomial")
# Set seed
set.seed(863)
# Build model
rfModel <- randomForest::randomForest(as.factor(Future)~. ,data=training_set)
# Plot variable importance
randomForest::varImpPlot(rfModel)
# Predict with the first model
firstPredictions <- predict(firstModel, newdata = test_set, type = "response")
# Predict with the second model
secondPredictions <- predict(secondModel, newdata = test_set, type = "response")
# Predict with the third model
thirdPredictions <- predict(thirdModel, newdata = test_set, type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
# Predict with the random forest model (column 2 = P(Future == 1))
rfPredictions<- predict(rfModel, newdata = test_set, type = "prob")
# Compare test-set AUC across the four models
sapply(list(firstPredictions, secondPredictions, thirdPredictions, rfPredictions[, 2]),
FUN=function(x) { pROC::auc(test_set$Future, x) }
)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## [1] 0.7321538 0.6514643 0.7031727 0.4804759
Chapter 1 - Introduction to Bayesian Linear Models
Non-Bayesian Linear Regression:
Bayesian Linear Regression:
Comparing frequentist and Bayesian models:
Example code includes:
# Print the first 6 rows
head(songs)
# Print the structure
str(songs)
# Create the model here (frequentist fit)
lm_model <- lm(popularity ~ song_age, data = songs)
# Produce the summary
summary(lm_model)
# Print a tidy summary of the coefficients
tidy(lm_model)
# Create the model here (Bayesian fit via rstanarm; same formula)
stan_model <- stan_glm(popularity ~ song_age, data = songs)
# Produce the summary
summary(stan_model)
# Print a tidy summary of the coefficients
tidy(stan_model)
# Create the 90% credible intervals (posterior_interval's default)
posterior_interval(stan_model)
# Create the 95% credible intervals
posterior_interval(stan_model, prob = 0.95)
# Create the 80% credible intervals
posterior_interval(stan_model, prob = 0.8)
Chapter 2 - Modifying a Bayesian Model
What is in a Bayesian Model?
Prior Distributions:
User-Specified Priors:
Tuning Models for Stability:
Example code includes:
# 3 chains, 1000 iterations, 500 warmup
model_3chains <- stan_glm(popularity ~ song_age, data = songs,
chains = 3, iter = 1000, warmup = 500)
# Print a summary of model_3chains
summary(model_3chains)
# 2 chains, 100 iterations, 50 warmup
# (deliberately too few iterations, to illustrate convergence diagnostics)
model_2chains <- stan_glm(popularity ~ song_age, data = songs,
chains = 2, iter = 100, warmup = 50)
# Print a summary of model_2chains
summary(model_2chains)
# Estimate the model
stan_model <- stan_glm(popularity ~ song_age, data = songs)
# Print a summary of the prior distributions
prior_summary(stan_model)
# Calculate the adjusted scale for the intercept
10 * sd(songs$popularity)
# Calculate the adjusted scale for `song_age`
(2.5 / sd(songs$song_age)) * sd(songs$popularity)
# Calculate the adjusted scale for `valence`
(2.5 / sd(songs$valence)) * sd(songs$popularity)
# Estimate the model with unadjusted scales
no_scale <- stan_glm(popularity ~ song_age, data = songs,
prior_intercept = normal(autoscale = FALSE),
prior = normal(autoscale = FALSE),
prior_aux = exponential(autoscale = FALSE)
)
# Print the prior summary
prior_summary(no_scale)
# Estimate a model with flat priors (NULL removes each default prior)
flat_prior <- stan_glm(popularity ~ song_age, data = songs,
prior_intercept = NULL, prior = NULL, prior_aux = NULL)
# Print a prior summary
prior_summary(flat_prior)
# Estimate the model with an informative prior on the slope
inform_prior <- stan_glm(popularity ~ song_age, data = songs,
prior = normal(location = 20, scale = 0.1, autoscale = FALSE))
# Print the prior summary
prior_summary(inform_prior)
# Estimate the model with a new `adapt_delta` (raised toward 1 to
# reduce divergent transitions)
adapt_model <- stan_glm(popularity ~ song_age, data = songs,
control = list(adapt_delta = 0.99))
# View summary
summary(adapt_model)
# Estimate the model with a new `max_treedepth`
tree_model <- stan_glm(popularity ~ song_age, data = songs,
control = list(max_treedepth = 15))
# View summary
summary(tree_model)
Chapter 3 - Assessing Model Fit
Using R-Squared Statistics:
Posterior Predictive Model Checks:
Model Fit with Posterior Predictive Model Checks:
Bayesian Model Comparisons:
Example code includes:
# Print the R-squared from the linear model
lm_summary$r.squared
# Calculate sums of squares
ss_res <- var(residuals(lm_model))
ss_fit <- var(fitted(lm_model))
# Calculate the R-squared
1 - (ss_res / (ss_res + ss_fit))
# Save the variance of residuals
ss_res <- var(residuals(stan_model))
# Save the variance of fitted values
ss_fit <- var(fitted(stan_model))
# Calculate the R-squared
1 - (ss_res / (ss_res + ss_fit))
# Calculate posterior predictive scores
# (posterior_linpred returns draws of the linear predictor, one row per
# posterior draw; it omits the observation noise that posterior_predict
# would add)
predictions <- posterior_linpred(stan_model)
# Print a summary of the observed data
summary(songs$popularity)
# Print a summary of the 1st replication
summary(predictions[1,])
# Print a summary of the 10th replication
summary(predictions[10,])
# Calculate the posterior distribution of the R-squared
r2_posterior <- bayes_R2(stan_model)
# Make a histogram of the distribution
hist(r2_posterior)
# Create density comparison
pp_check(stan_model, "dens_overlay")
# Create scatter plot of means and standard deviations
pp_check(stan_model, "stat_2d")
# Estimate the model with 1 predictor
model_1pred <- stan_glm(popularity ~ song_age, data = songs)
# Print the LOO estimate for the 1 predictor model
loo(model_1pred)
# Estimate the model with both predictors (main effects + interaction)
model_2pred <- stan_glm(popularity ~ song_age * artist_name, data = songs)
# Print the LOO estimates for the 2 predictor model
loo(model_2pred)
Chapter 4 - Presenting and Using Bayesian Regression
Visualizing Bayesian Models:
(e.g., geom_abline(data = draws, aes(intercept = `(Intercept)`, slope = mom_iq), size = 0.2, alpha = 0.1, color = "skyblue") + geom_abline(intercept = model_intercept, slope = model_slope))
Making Predictions:
Visualizing Predictions:
Conclusion:
Example code includes:
# Save the model parameters
tidy_coef <- tidy(stan_model)
# Extract intercept and slope
model_intercept <- tidy_coef$estimate[1]
model_slope <- tidy_coef$estimate[2]
# Create the plot
ggplot(songs, aes(x = song_age, y = popularity)) +
geom_point() +
geom_abline(intercept = model_intercept, slope = model_slope)
# Save the values from each draw of the posterior distribution
draws <- spread_draws(stan_model, `(Intercept)`, `song_age`)
# Print the `draws` data frame to the console
draws
# Create the plot
ggplot(songs, aes(x = song_age, y = popularity)) +
geom_point()
# Create the plot
ggplot(songs, aes(x = song_age, y = popularity)) +
geom_point() +
geom_abline(data = draws, aes(intercept = `(Intercept)`, slope = song_age),
size = 0.1, alpha = 0.2, color = "skyblue"
)
# Create the plot
ggplot(songs, aes(x = song_age, y = popularity)) +
geom_point() +
geom_abline(data = draws, aes(intercept = `(Intercept)`, slope = song_age),
size = 0.1, alpha = 0.2, color = "skyblue"
) +
geom_abline(intercept = model_intercept, slope = model_slope)
# Estimate the regression model
stan_model <- stan_glm(popularity ~ song_age + artist_name, data = songs)
# Print the model summary
summary(stan_model)
# Get posteriors of predicted scores for each observation
posteriors <- posterior_predict(stan_model)
# Print 10 predicted scores for 5 songs
posteriors[1:10, 1:5]
# Create data frame of new data
predict_data <- data.frame(song_age = 663, artist_name = "Beyoncé")
# Create posterior predictions for Lemonade album
new_predictions <- posterior_predict(stan_model, newdata = predict_data)
# Print first 10 predictions for the new data
new_predictions[1:10, ]
# Print a summary of the posterior distribution of predicted popularity
summary(new_predictions[, 1])
# View new data predictions
new_predictions[1:10, ]
# Convert to data frame and rename variables
new_predictions <- as.data.frame(new_predictions)
colnames(new_predictions) <- c("Adele", "Taylor Swift", "Beyoncé")
# Create tidy data structure
plot_posterior <- gather(new_predictions, key = "artist_name", value = "predict")
# Print formatted data
head(plot_posterior)
# Create density plots of the posterior predictions for each artist
ggplot(plot_posterior, aes(x = predict)) +
facet_wrap(~ artist_name, ncol = 1) +
geom_density()
Chapter 1 - Introduction to ChIP-seq
What is ChIP-seq?
ChIP-seq Workflow:
ChIP-seq Results Summary:
Example code includes:
# Print a summary of the 'reads' object
print(reads)
# Get the start position of the first read
start_first <- start(reads)[1]
# Get the end position of the last read
end_last <- end(reads)[length(reads)]
# Compute the number of reads covering each position in the selected region
cvg <- coverage(reads)
# Print a summary of the 'peaks' object
print(peaks)
# Use the score function to find the index of the highest scoring peak
max_idx <- which.max(score(peaks))
# Extract the genomic coordinates of the highest scoring peak using the `chrom` and `ranges` functions
max_peak_chrom <- chrom(peaks)[max_idx]
max_peak_range <- ranges(peaks)[max_idx]
# Create a vector of colors to label groups (there are 2 samples per group)
group <- c(primary = rep("blue", 2), TURP = rep("red", 2))
# Plot the sample correlation matrix `sample_cor` as a heat map
heatmap(sample_cor, ColSideColors = group, RowSideColors = group,
cexCol = 0.75, cexRow = 0.75, symm = TRUE)
# Create a heat map of peak read counts
heatmap(read_counts, ColSideColors = group, labRow = "", cexCol = 0.75)
# Take a look at the full gene sets
print(ar_sets)
# Visualise the overlap between the two groups using the `upset` function
upset(fromList(ar_sets))
# Print the genes with differential binding
print(db_sets)
# Visualise the overlap between the two groups using the `upset` function
upset(fromList(db_sets))
Chapter 2 - Back to Basics - Preparing ChIP-seq Data
Importing Data:
Closer Look at Peaks:
Cleaning ChIP-seq Data:
Assessing Enrichment:
Example code includes:
# Load reads from chr20_bam file
reads <- readGAlignments(chr20_bam)
# Create a `BamViews` object for the range 29805000 - 29820000 on chromosome 20
bam_views <- BamViews(chr20_bam, bamRanges=GRanges("chr20", IRanges(start=29805000, end=29820000)))
# Load only the reads in that view
reads_sub <- readGAlignments(bam_views)
# Print the `reads_sub` object
str(reads_sub)
# Load peak calls from chr20_peaks
peaks <- import.bed(chr20_peaks, genome="hg19")
# Create a BamViews object
bam_views <- BamViews(chr20_bam, bamRanges=peaks)
# Load the reads
reads <- readGAlignments(bam_views)
# Create tracks
peak_track <- AnnotationTrack(peak_calls, name="Peaks")
cover_track <- DataTrack(cover_ranges, window=10500, type="polygon", name="Coverage", fill.mountain=c("lightgrey", "lightgrey"), col.mountain="grey")
# Highlight peak locations across tracks
peak_highlight <- HighlightTrack(trackList = list(cover_track, peak_track), range = peak_calls)
# Produce plot
plotTracks(list(ideogram, peak_highlight, GenomeAxisTrack()), chromosome="chr20", from=start_pos, to=end_pos)
# Load reads with mapping qualities by requesting the "mapq" entries
reads <- readGAlignments(bam_file, param=ScanBamParam(what="mapq"))
# Identify good quality alignments
high_mapq <- mcols(reads)$mapq >= 20
# Examine mapping quality distribution for high and low quality alignments
boxplot(mcols(reads)$mapq ~ high_mapq, xlab="good quality alignments", ylab="mapping quality")
# Remove low quality alignments
reads_good <- subset(reads, high_mapq)
Chapter 3 - Comparing ChIP-seq Samples
Introduction to Differential Binding:
Testing for Differential Binding:
Closer Look at Differential Binding:
Example code includes:
# Compute the pairwise distances between samples using `dist`
cover_dist <- dist(t(cover))
# Use `hclust()` to create a dendrogram from the distance matrix
cover_dendro <- hclust(cover_dist)
# Plot the dendrogram
plot(cover_dendro)
# Print the `peaks` object
print(peaks)
# Obtain the coordinates of the merged peaks
merged_peaks <- peaks$merged
# Extract the number of peaks present in the data
peak_count <- nrow(merged_peaks)
# Create a heatmap using the `dba.plotHeatmap()` function
dba.plotHeatmap(peaks, maxSites = peak_count, correlations = FALSE)
# Examine the ar_binding object
print(ar_binding)
# Identify the category corresponding to the tumor type contrast
contrast <- DBA_CONDITION
# Establish the contrast to compare the two tumor types
ar_binding <- dba.contrast(ar_binding, categories=contrast, minMembers=2)
# Examine the ar_binding object again to confirm that the contrast has been added
print(ar_binding)
# Examine the `ar_binding` object to confirm that it contains the required contrast
print(ar_binding)
# Run the differential binding analysis
ar_diff <- dba.analyze(ar_binding)
# Examine the result
print(ar_diff)
# Create a PCA plot using all peaks
dba.plotPCA(ar_diff, DBA_CONDITION)
# Create a PCA plot using only differentially bound peaks
dba.plotPCA(ar_diff, DBA_CONDITION, contrast = 1)
# Create a heatmap using all peaks
dba.plotHeatmap(ar_diff, DBA_CONDITION, correlations = FALSE, maxSites = 440)
# Create a heatmap using only differentially bound peaks
dba.plotHeatmap(ar_diff, DBA_CONDITION, contrast=1, correlations = FALSE)
# Create an MA plot
dba.plotMA(ar_diff)
# Create a volcano plot
dba.plotVolcano(ar_diff)
# Create a box plot of the peak intensities
compare_groups <- dba.plotBox(ar_diff, notch=FALSE)
# Inspect the returned p-values
print(compare_groups)
Chapter 4 - From Peaks to Genes to Function
Interpreting ChIP-seq Peaks:
Interpreting Gene Lists:
Advanced ChIP-seq Analyses:
Example code includes:
# Extract peaks from ChIPQCexperiment object
peak_calls <- peaks(ar_calls)
# Only keep samples that passed QC
peak_passed <- peak_calls[qc_pass]
# Find overlaps between peak sets
peaks_combined <- findOverlapsOfPeaks(peak_passed[[1]], peak_passed[[2]],
peak_passed[[3]], peak_passed[[4]],
maxgap=50
)
# Examine merged peak set
print(peaks_combined)
# Annotate peaks with closest gene
peak_anno <- annoPeaks(peaks_merged, human_genes, bindingType="startSite", bindingRegion=c(-5000,5000))
# How many peaks were found close to genes?
length(peak_anno)
# Where are peaks located relative to genes?
table(peak_anno$insideFeature)
# Create Venn diagram
dba.plotVenn(ar_diff, mask=1:4)
# Convert the matrix of called peaks into a data frame
called_peaks <- as.data.frame(ar_diff$called)
# Create UpSet plot
upset(called_peaks, keep.order = TRUE, sets=colnames(ar_diff$called), order.by="freq")
# Select all peaks with higher intensity in treatment resistant samples
turp_peaks <- peaks_binding[, "GSM1598218"] + peaks_binding[, "GSM1598219"] < peaks_binding[, "GSM1598223"] + peaks_binding[, "GSM1598225"]
# Run enrichment analysis
enrich_turp <- chipenrich(peaks_comb[turp_peaks, ], genome="hg19",
genesets = "hallmark", out_name = NULL,
locusdef = "nearest_tss", qc_plots=FALSE)
# Print the results of the analysis
print(enrich_turp$results)
# Examine the top gene sets
head(enrich_primary$results)
# Extract the gene IDs for the top ranking set
genes <- enrich_primary$results$Geneset.Peak.Genes[1]
# Split gene IDs into a vector
genes_split <- strsplit(genes, ', ')[[1]]
# Convert gene IDs to gene symbols
gene_symbol <- select(org.Hs.eg.db, keys=genes_split, columns="SYMBOL", keytype="ENTREZID")
# Print the result
print(gene_symbol)
# This is the base URL for all KEGG pathways
base_url <- "https://www.kegg.jp/pathway/"
# Add pathway ID to URL
path_url <- paste0(base_url, top_path, collapse="+")
# Collapse gene IDs into selection string
gene_select <- paste(genes, collapse="+")
# Add gene IDs to URL
path_url <- paste(path_url, gene_select, sep="+")
Chapter 1 - Principles
Fundamentals:
Types of Data and Endpoints:
Basic Statistical Analysis:
Example code includes:
Acupuncture <- readRDS("./RInputFiles/Ex1_1_1.Rds")
#Explore the Acupuncture dataset with the str() function
str(Acupuncture)
## 'data.frame': 396 obs. of 18 variables:
## $ id : num 100 101 104 105 108 112 113 114 126 130 ...
## $ age : num 47 52 32 53 56 45 45 49 47 46 ...
## $ sex : Factor w/ 2 levels "Female","Male": 1 1 1 1 1 1 1 1 1 1 ...
## $ migraine : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ chronicity : num 35 8 14 10 40 27 30 49 42 3 ...
## $ treatment.group : Factor w/ 2 levels "Acupuncture",..: 1 2 2 2 2 1 1 1 2 1 ...
## $ score.baseline : num 10.8 9.5 16 32.5 16.5 ...
## $ score.baseline.4 : Factor w/ 4 levels "[6.75,15.2]",..: 1 1 2 3 2 1 4 3 2 3 ...
## $ age.group : Factor w/ 4 levels "18-34","35-44",..: 3 3 1 3 4 3 3 3 3 3 ...
## $ score.month3 : num NA NA NA 44 17.5 ...
## $ score.month12 : num NA NA 15.3 NA 23.2 ...
## $ withdrawal.reason : Factor w/ 7 levels "adverse effects",..: 5 7 NA 7 NA NA NA NA NA NA ...
## $ completedacupuncturetreatment: num NA NA NA NA NA 1 NA NA NA NA ...
## $ completer : num 0 0 1 0 1 1 1 1 1 1 ...
## $ total.therap.visits : num NA 2 0 2 0 0 7 0 0 10 ...
## $ total.gp.visits : num NA 4 0 0 0 5 1 0 1 0 ...
## $ total.spec.visits : num NA 0 0 0 0 0 0 0 0 0 ...
## $ total.days.sick : num NA 6 3 NA 23 2 6 9 19 0 ...
#Display the treatment group frequencies
table(Acupuncture$treatment.group)
##
## Acupuncture Control
## 202 194
#Generate summaries of the variables by treatment group and save results as baselines
baselines <- compareGroups::compareGroups(treatment.group ~ score.baseline + age + sex, data = Acupuncture)
## Registered S3 method overwritten by 'SNPassoc':
## method from
## summary.haplo.glm haplo.stats
#Use the createTable function to display the results saved in baselines
baseline.table <- compareGroups::createTable(baselines, show.ratio = FALSE, show.p.overall=FALSE)
#Display the created summary table
baseline.table
##
## --------Summary descriptives table by 'treatment.group'---------
##
## ______________________________________
## Acupuncture Control
## N=202 N=194
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
## score.baseline 25.8 (15.5) 27.6 (16.8)
## age 45.6 (10.6) 45.3 (11.5)
## sex:
## Female 169 (83.7%) 164 (84.5%)
## Male 33 (16.3%) 30 (15.5%)
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
#Generate a variable for the change from baseline at 12 months
Acupuncture$diff.month12 <- Acupuncture$score.month12 - Acupuncture$score.baseline
#Use the new variable to generate the percentage change from baseline at 12 months
Acupuncture$pct.month12 <- Acupuncture$diff.month12 / Acupuncture$score.baseline * 100
#Generate a histogram for percentage change from baseline within each treatment group
ggplot(data=Acupuncture, aes(x=pct.month12)) +
geom_histogram(fill="white", color="black") + facet_wrap( ~ treatment.group) +
xlab("Percentage Change from Baseline at Month 12")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 100 rows containing non-finite values (stat_bin).
#Generate the binary response variable.
Acupuncture$resp35.month12 <- ifelse(Acupuncture$pct.month12 < (-35), 1, 0)
#Encode this new variable as a factor.
Acupuncture$resp35.month12 <- factor(Acupuncture$resp35.month12,
levels = c(1,0),
labels=c("greater than 35%", "less than or eq to 35%")
)
#Tabulate the numbers and percentages of patients in each category.
Acupuncture %>%
group_by(resp35.month12) %>%
filter(!is.na(resp35.month12)) %>%
summarise(n = n()) %>%
mutate(pct = n / sum(n)*100)
## Warning: Factor `resp35.month12` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## # A tibble: 2 x 3
## resp35.month12 n pct
## <fct> <int> <dbl>
## 1 greater than 35% 132 44.6
## 2 less than or eq to 35% 164 55.4
#Dichotomize the variable for complementary therapist visits into 0 or at least 1 visit.
Acupuncture$any.therap.visits <- ifelse(Acupuncture$total.therap.visits == 0, 0, 1)
#Encode the new variable as a factor
Acupuncture$any.therap.visits <- factor(Acupuncture$any.therap.visits,
levels = c(0,1),
labels=c("Did not visit CT", "Visited CT")
)
#Dichotomize the variable for complementary therapist visits into 0 or at least 1 visit.
Acupuncture$any.gp.visits <- ifelse(Acupuncture$total.gp.visits == 0, 0, 1)
#Encode the new variable as a factor
Acupuncture$any.gp.visits <- factor(Acupuncture$any.gp.visits,
levels = c(0,1),
labels=c("Did not visit GP", "Visited GP")
)
#Dichotomize the variable for complementary therapist visits into 0 or at least 1 visit.
Acupuncture$any.spec.visits <- ifelse(Acupuncture$total.spec.visits == 0, 0, 1)
#Encode the new variable as a factor
Acupuncture$any.spec.visits <- factor(Acupuncture$any.spec.visits,
levels = c(0,1),
labels=c("Did not visit specialist", "Visited specialist")
)
#Generate a combined binary endpoint for having any professional visits.
Acupuncture$combined <- ifelse(Acupuncture$any.therap.visits=="Did not visit CT" &
Acupuncture$any.gp.visits=="Did not visit GP" &
Acupuncture$any.spec.visits=="Did not visit specialist", 0, 1
)
#Encode the new variable as a factor
Acupuncture$combined <- factor(Acupuncture$combined,
levels = c(0,1),
labels=c("No visits", "At least one visit")
)
#Tabulate the new composite endpoint.
table(Acupuncture$combined, useNA="ifany")
##
## No visits At least one visit <NA>
## 118 211 67
#Perform the t-test, assuming the variances are equal in the treatment groups
t.test(pct.month12 ~ treatment.group, var.equal=TRUE, data = Acupuncture)
##
## Two Sample t-test
##
## data: pct.month12 by treatment.group
## t = -3.3531, df = 294, p-value = 0.0009039
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -25.176008 -6.552921
## sample estimates:
## mean in group Acupuncture mean in group Control
## -32.36822 -16.50376
#Use the compareGroups function to save a summary of the results in pct.month12.test
pct.month12.test <- compareGroups::compareGroups(treatment.group ~ pct.month12, data = Acupuncture)
#Use the createTable function to summarize and store the results saved in pct.month12.test.
pct.month12.table <- compareGroups::createTable(pct.month12.test, show.ratio = FALSE, show.p.overall=TRUE)
#Display the results of pct.month12.table
pct.month12.table
##
## --------Summary descriptives table by 'treatment.group'---------
##
## _________________________________________________
## Acupuncture Control p.overall
## N=158 N=138
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
## pct.month12 -32.37 (42.3) -16.50 (38.6) 0.001
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
#Use a boxplot to visualize the total days off sick by treatment group.
ggplot(data=Acupuncture, aes(x=treatment.group, y=total.days.sick)) +
geom_boxplot(fill="white", color="black") +
ylab("Total days off sick") + xlab("Treatment group")
## Warning: Removed 68 rows containing non-finite values (stat_boxplot).
#Use the Wilcoxon Rank Sum test to compare the two distributions.
wilcox.test(total.days.sick ~ treatment.group, data=Acupuncture)
##
## Wilcoxon rank sum test with continuity correction
##
## data: total.days.sick by treatment.group
## W = 11991, p-value = 0.09957
## alternative hypothesis: true location shift is not equal to 0
#Perform the test of proportions on resp35.month12 by treatment.group.
prop.test(table(Acupuncture$treatment.group, Acupuncture$resp35.month12), correct=FALSE)
##
## 2-sample test for equality of proportions without continuity
## correction
##
## data: table(Acupuncture$treatment.group, Acupuncture$resp35.month12)
## X-squared = 15.032, df = 1, p-value = 0.0001057
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## 0.1143954 0.3346965
## sample estimates:
## prop 1 prop 2
## 0.5506329 0.3260870
#Use the tidy function to store and display a summary of the test results.
resp35.month12.test <- broom::tidy(prop.test(table(Acupuncture$treatment.group,
Acupuncture$resp35.month12
), correct=FALSE
)
)
resp35.month12.test
## # A tibble: 1 x 9
## estimate1 estimate2 statistic p.value parameter conf.low conf.high method
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 0.551 0.326 15.0 1.06e-4 1 0.114 0.335 2-sam~
## # ... with 1 more variable: alternative <chr>
#Calculate the treatment difference
resp35.month12.test$estimate1 - resp35.month12.test$estimate2
## [1] 0.224546
Chapter 2 - Trial Designs
Randomization Methods:
Crossover, Factorial, Cluster Randomized Trials:
Equivalence and Non-Inferiority Trials:
Bioequivalence trials:
Example code includes:
#Generate a vector to store treatment labels "A" and "B"
set.seed(123)
arm<-c("A", "B")
#Randomly select treatment arm 14 times with the sample function and store in a vector
simple <- sample(arm, 14, replace=TRUE)
#Display the contents of the vector
simple
## [1] "B" "B" "B" "B" "B" "A" "B" "A" "B" "A" "B" "A" "A" "A"
#Tabulate the numbers assigned to each treatment.
table(simple)
## simple
## A B
## 6 8
#Use the blockrand function for 14 patients, two arms and block size 2.
set.seed(123)
block2 <- blockrand::blockrand(n=14, num.levels = 2, block.prefix='B', block.sizes = c(1,1))
#Display the list.
block2
## id block.id block.size treatment
## 1 1 B1 2 B
## 2 2 B1 2 A
## 3 3 B2 2 B
## 4 4 B2 2 A
## 5 5 B3 2 A
## 6 6 B3 2 B
## 7 7 B4 2 B
## 8 8 B4 2 A
## 9 9 B5 2 A
## 10 10 B5 2 B
## 11 11 B6 2 A
## 12 12 B6 2 B
## 13 13 B7 2 B
## 14 14 B7 2 A
#Tabulate the numbers per treatment arm.
table(block2$treatment)
##
## A B
## 7 7
#Use block randomization to produce lists of length 100 and random block sizes between 2 and 8.
set.seed(123)
under55 <- blockrand::blockrand(n=100, num.levels = 2, block.sizes = 1:4,
id.prefix='U55', block.prefix='U55', stratum='<55y'
)
above55 <- blockrand::blockrand(n=100, num.levels = 2, block.sizes = 1:4,
id.prefix='A55', block.prefix='A55',stratum='>=55y'
)
#Explore the two lists
head(under55)
## id stratum block.id block.size treatment
## 1 U55001 <55y U5501 6 B
## 2 U55002 <55y U5501 6 A
## 3 U55003 <55y U5501 6 B
## 4 U55004 <55y U5501 6 A
## 5 U55005 <55y U5501 6 A
## 6 U55006 <55y U5501 6 B
head(above55)
## id stratum block.id block.size treatment
## 1 A55001 >=55y A5501 6 A
## 2 A55002 >=55y A5501 6 B
## 3 A55003 >=55y A5501 6 A
## 4 A55004 >=55y A5501 6 A
## 5 A55005 >=55y A5501 6 B
## 6 A55006 >=55y A5501 6 B
#Tabulate the numbers assigned to each treatment within each strata
table(under55$treatment)
##
## A B
## 53 53
table(above55$treatment)
##
## A B
## 51 51
fact.data <- readRDS("./RInputFiles/fact.data.Rds")
str(fact.data)
## 'data.frame': 502 obs. of 3 variables:
## $ glutamine: Factor w/ 2 levels "No","Yes": 2 1 1 2 1 1 2 2 1 1 ...
## $ selenium : Factor w/ 2 levels "No","Yes": 1 1 2 1 2 1 1 1 2 2 ...
## $ infection: Factor w/ 2 levels "No","Yes": 2 1 2 2 1 2 2 1 1 1 ...
#Explore the fact.data using the head function.
head(fact.data)
## glutamine selenium infection
## 1 Yes No Yes
## 2 No No No
## 3 No Yes Yes
## 4 Yes No Yes
## 5 No Yes No
## 6 No No Yes
#Display the numbers with and without infections by supplement combination.
fact.data %>%
count(glutamine, selenium, infection)
## # A tibble: 8 x 4
## glutamine selenium infection n
## <fct> <fct> <fct> <int>
## 1 No No No 57
## 2 No No Yes 68
## 3 No Yes No 64
## 4 No Yes Yes 63
## 5 Yes No No 55
## 6 Yes No Yes 71
## 7 Yes Yes No 61
## 8 Yes Yes Yes 63
#Display the numbers and proportions with infections for those who received glutamine.
fact.data %>%
group_by(glutamine) %>%
filter(infection=="Yes") %>%
summarise (n = n()) %>%
mutate(prop = n / sum(n))
## # A tibble: 2 x 3
## glutamine n prop
## <fct> <int> <dbl>
## 1 No 131 0.494
## 2 Yes 134 0.506
#Display the numbers and proportions with infections for those who received selenium.
fact.data %>%
group_by(selenium) %>%
filter(infection=="Yes") %>%
summarise (n = n()) %>%
mutate(prop = n / sum(n))
## # A tibble: 2 x 3
## selenium n prop
## <fct> <int> <dbl>
## 1 No 139 0.525
## 2 Yes 126 0.475
#Display the numbers with and without infections by supplement combination.
fact.data %>%
count(glutamine, selenium, infection)
## # A tibble: 8 x 4
## glutamine selenium infection n
## <fct> <fct> <fct> <int>
## 1 No No No 57
## 2 No No Yes 68
## 3 No Yes No 64
## 4 No Yes Yes 63
## 5 Yes No No 55
## 6 Yes No Yes 71
## 7 Yes Yes No 61
## 8 Yes Yes Yes 63
#Display the numbers and proportions with and without infection among those who received glutamine.
fact.data %>%
group_by(infection) %>%
filter(glutamine=="Yes") %>%
summarise (n = n()) %>%
mutate(prop = n / sum(n))
## # A tibble: 2 x 3
## infection n prop
## <fct> <int> <dbl>
## 1 No 116 0.464
## 2 Yes 134 0.536
#Display the numbers and proportions with and without infection among those who received selenium.
fact.data %>%
group_by(infection) %>%
filter(selenium=="Yes") %>%
summarise (n = n()) %>%
mutate(prop = n / sum(n))
## # A tibble: 2 x 3
## infection n prop
## <fct> <int> <dbl>
## 1 No 125 0.498
## 2 Yes 126 0.502
#Calculate the effect of glutamine on infection
epitools::oddsratio.wald(fact.data$glutamine, fact.data$infection)
## $data
## Outcome
## Predictor No Yes Total
## No 121 131 252
## Yes 116 134 250
## Total 237 265 502
##
## $measure
## odds ratio with 95% C.I.
## Predictor estimate lower upper
## No 1.000000 NA NA
## Yes 1.066991 0.7515148 1.514901
##
## $p.value
## two-sided
## Predictor midp.exact fisher.exact chi.square
## No NA NA NA
## Yes 0.7180246 0.7216211 0.7169009
##
## $correction
## [1] FALSE
##
## attr(,"method")
## [1] "Unconditional MLE & normal approximation (Wald) CI"
#Calculate the effect of selenium on infection
epitools::oddsratio.wald(fact.data$selenium, fact.data$infection)
## $data
## Outcome
## Predictor No Yes Total
## No 112 139 251
## Yes 125 126 251
## Total 237 265 502
##
## $measure
## odds ratio with 95% C.I.
## Predictor estimate lower upper
## No 1.0000000 NA NA
## Yes 0.8122014 0.5718144 1.153646
##
## $p.value
## two-sided
## Predictor midp.exact fisher.exact chi.square
## No NA NA NA
## Yes 0.2469929 0.2833307 0.2451355
##
## $correction
## [1] FALSE
##
## attr(,"method")
## [1] "Unconditional MLE & normal approximation (Wald) CI"
relapse.trial <- data.frame(Treatment=rep(c("New", "Standard"), times=c(264, 263)),
Relapse=rep(rep(c("At least one relapse", "No relapse"), times=2),
times=c(184, 80, 169, 94)
),
stringsAsFactors = TRUE
)
str(relapse.trial)
## 'data.frame': 527 obs. of 2 variables:
## $ Treatment: Factor w/ 2 levels "New","Standard": 1 1 1 1 1 1 1 1 1 1 ...
## $ Relapse : Factor w/ 2 levels "At least one relapse",..: 1 1 1 1 1 1 1 1 1 1 ...
table(relapse.trial)
## Relapse
## Treatment At least one relapse No relapse
## New 184 80
## Standard 169 94
#Use the head function to explore the relapse.trial dataset
head(relapse.trial)
## Treatment Relapse
## 1 New At least one relapse
## 2 New At least one relapse
## 3 New At least one relapse
## 4 New At least one relapse
## 5 New At least one relapse
## 6 New At least one relapse
#Calculate the number of percentages of relapse by treatment group
relapse.trial %>%
group_by(Treatment, Relapse) %>%
summarise(n = n()) %>%
mutate(pct = (n / sum(n))*100)
## # A tibble: 4 x 4
## # Groups: Treatment [2]
## Treatment Relapse n pct
## <fct> <fct> <int> <dbl>
## 1 New At least one relapse 184 69.7
## 2 New No relapse 80 30.3
## 3 Standard At least one relapse 169 64.3
## 4 Standard No relapse 94 35.7
#Calculate the two-sided 90% confidence interval for the difference
prop.test(table(relapse.trial$Treatment, relapse.trial$Relapse),
alternative = "two.sided", conf.level=0.9, correct=FALSE
)
##
## 2-sample test for equality of proportions without continuity
## correction
##
## data: table(relapse.trial$Treatment, relapse.trial$Relapse)
## X-squared = 1.7619, df = 1, p-value = 0.1844
## alternative hypothesis: two.sided
## 90 percent confidence interval:
## -0.01289979 0.12166808
## sample estimates:
## prop 1 prop 2
## 0.6969697 0.6425856
PKData <- readRDS("./RInputFiles/PKData.Rds")
str(PKData)
## Classes 'tbl_df', 'tbl' and 'data.frame': 12 obs. of 6 variables:
## $ subject.id : num 1001 1001 1001 1001 1001 ...
## $ sample.id : num 1 2 3 4 5 6 7 8 9 10 ...
## $ time : chr "predose" "15min" "30min" "1h" ...
## $ rel.time : num -0.25 0.25 0.5 1 1.5 2 3 4 6 8 ...
## $ plasma.conc: chr "<LLQ" "2.19" "7.28" "12.98" ...
## $ unit : chr "ng/mL" "ng/mL" "ng/mL" "ng/mL" ...
#Display the dataset contents
head(PKData)
## # A tibble: 6 x 6
## subject.id sample.id time rel.time plasma.conc unit
## <dbl> <dbl> <chr> <dbl> <chr> <chr>
## 1 1001 1 predose -0.25 <LLQ ng/mL
## 2 1001 2 15min 0.25 2.19 ng/mL
## 3 1001 3 30min 0.5 7.28 ng/mL
## 4 1001 4 1h 1 12.98 ng/mL
## 5 1001 5 1.5h 1.5 10.76 ng/mL
## 6 1001 6 2h 2 9.01 ng/mL
#Store a numeric version of the concentration variable in plasma.conc.n
PKData$plasma.conc.n <- as.numeric(PKData$plasma.conc)
## Warning: NAs introduced by coercion
#Use ggplot to plot the concentration levels against relative time
ggplot(data=PKData, aes(x=rel.time, y=plasma.conc.n)) +
geom_line() +
geom_point() + ggtitle("Individual Concentration Profile") +
xlab("Time Relative to First Dose, h") +
ylab("Plasma Concentration, ng/mL")
## Warning: Removed 2 rows containing missing values (geom_path).
## Warning: Removed 2 rows containing missing values (geom_point).
#Use the summary function to find the max concentration level
summary(PKData$plasma.conc.n)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.610 2.388 5.315 5.750 8.578 12.980 2
#Use pk.calc.tmax to find when Cmax occurred, specifying the concentration and time.
PKNCA::pk.calc.tmax(PKData$plasma.conc.n, PKData$rel.time)
## [1] 1
#Use pk.calc.auc to estimate AUC between 0.25 and 12hrs.
PKNCA::pk.calc.auc(PKData$plasma.conc.n, PKData$rel.time, interval=c(0.25, 12), method="linear")
## [1] 43.33125
Chapter 3 - Sample Size and Power
Introduction to Sample Size and Power:
Sample Size Adjustments:
Interim Analyses and Stopping Rules:
Sample Size for Alternative Trial Designs:
Example code includes:
#Generate the sample size for delta of 1, with SD of 3 and 80% power.
ss1 <- power.t.test(delta=1, sd=3, power=0.8)
ss1
##
## Two-sample t test power calculation
##
## n = 142.2466
## delta = 1
## sd = 3
## sig.level = 0.05
## power = 0.8
## alternative = two.sided
##
## NOTE: n is number in *each* group
#Round up and display the numbers needed per group
ceiling(ss1$n)
## [1] 143
#Use the sample size from above to show that it provides 80% power
power.t.test(n=ceiling(ss1$n), delta=1, sd=3)
##
## Two-sample t test power calculation
##
## n = 143
## delta = 1
## sd = 3
## sig.level = 0.05
## power = 0.802082
## alternative = two.sided
##
## NOTE: n is number in *each* group
#Generate a vector containing values between 0.5 and 2.0, incrementing by 0.25
delta <- seq(0.5, 2, 0.25)
npergp <- NULL
#Specify the standard deviation and power
for(i in 1:length(delta)){
npergp[i] <- ceiling(power.t.test(delta = delta[i], sd = 3, power = 0.8)$n)
}
#Create a data frame from the deltas and sample sizes
sample.sizes <- data.frame(delta, npergp)
#Plot the patients per group against the treatment differences
ggplot(data=sample.sizes, aes(x=delta, y=npergp)) +
geom_line() +
geom_point() +
ggtitle("Sample Size Scenarios") +
xlab("Treatment Difference") +
ylab("Patients per Group")
#Use the power.prop.test to generate sample sizes for the proportions
power.prop.test(p1 = 0.4, p2 = 0.6, power = 0.8)
##
## Two-sample comparison of proportions power calculation
##
## n = 96.92364
## p1 = 0.4
## p2 = 0.6
## sig.level = 0.05
## power = 0.8
## alternative = two.sided
##
## NOTE: n is number in *each* group
#Find the minimum detectable percentage for the above using 150 patients per group.
power.prop.test(p1 = 0.4, power = 0.8, n = 150)$p2*100
## [1] 56.0992
#Use 90% power, delta 1.5, standard deviations of 2.5, fraction of 0.5
unequalgps <- samplesize::n.ttest(power = 0.9, alpha = 0.05, mean.diff = 1.5,
sd1 = 2.5, sd2 = 2.5, k = 0.5,
design = "unpaired", fraction = "unbalanced"
)
unequalgps
## $`Total sample size`
## [1] 135
##
## $`Sample size group 1`
## [1] 90
##
## $`Sample size group 2`
## [1] 45
##
## $Fraction
## [1] 0.5
#Generate sample sizes comparing the proportions using a two-sided test
two.sided <- power.prop.test(p1=0.1, p2=0.3, power=0.8, alternative = "two.sided")
two.sided
##
## Two-sample comparison of proportions power calculation
##
## n = 61.5988
## p1 = 0.1
## p2 = 0.3
## sig.level = 0.05
## power = 0.8
## alternative = two.sided
##
## NOTE: n is number in *each* group
ceiling(two.sided$n)
## [1] 62
#Repeat using a one-sided test
one.sided <- power.prop.test(p1=0.1, p2=0.3, power=0.8, alternative = "one.sided")
one.sided
##
## Two-sample comparison of proportions power calculation
##
## n = 48.40295
## p1 = 0.1
## p2 = 0.3
## sig.level = 0.05
## power = 0.8
## alternative = one.sided
##
## NOTE: n is number in *each* group
ceiling(one.sided$n)
## [1] 49
#Display the reduction per group
ceiling(two.sided$n)- ceiling(one.sided$n)
## [1] 13
#Use the gsDesign function to generate the p-values for four analyses under the Pocock rule
#k = number of planned analyses (3 interim + 1 final); test.type = 2 gives
#symmetric two-sided boundaries; sfu selects the spending function /
#boundary family ("Pocock" uses the same nominal p at every look)
Pocock <- gsDesign::gsDesign(k=4, test.type=2, sfu="Pocock")
Pocock
## Symmetric two-sided group sequential design with
## 90 % power and 2.5 % Type I Error.
## Spending computations assume trial stops
## if a bound is crossed.
##
## Sample
## Size
## Analysis Ratio* Z Nominal p Spend
## 1 0.296 2.36 0.0091 0.0091
## 2 0.592 2.36 0.0091 0.0067
## 3 0.887 2.36 0.0091 0.0051
## 4 1.183 2.36 0.0091 0.0041
## Total 0.0250
##
## ++ alpha spending:
## Pocock boundary.
## * Sample size ratio compared to fixed design with no interim
##
## Boundary crossing probabilities and expected sample size
## assume any cross stops the trial
##
## Upper boundary (power or Type I Error)
## Analysis
## Theta 1 2 3 4 Total E{N}
## 0.0000 0.0091 0.0067 0.0051 0.0041 0.025 1.1561
## 3.2415 0.2748 0.3059 0.2056 0.1136 0.900 0.6975
##
## Lower boundary (futility or Type II Error)
## Analysis
## Theta 1 2 3 4 Total
## 0.0000 0.0091 0.0067 0.0051 0.0041 0.025
## 3.2415 0.0000 0.0000 0.0000 0.0000 0.000
2*(1-pnorm(Pocock$upper$bound))
## [1] 0.01821109 0.01821109 0.01821109 0.01821109
#Repeat for the O'Brien & Fleming rule
OF <- gsDesign::gsDesign(k=4, test.type=2, sfu="OF")
OF
## Symmetric two-sided group sequential design with
## 90 % power and 2.5 % Type I Error.
## Spending computations assume trial stops
## if a bound is crossed.
##
## Sample
## Size
## Analysis Ratio* Z Nominal p Spend
## 1 0.256 4.05 0.0000 0.0000
## 2 0.511 2.86 0.0021 0.0021
## 3 0.767 2.34 0.0097 0.0083
## 4 1.022 2.02 0.0215 0.0145
## Total 0.0250
##
## ++ alpha spending:
## O'Brien-Fleming boundary.
## * Sample size ratio compared to fixed design with no interim
##
## Boundary crossing probabilities and expected sample size
## assume any cross stops the trial
##
## Upper boundary (power or Type I Error)
## Analysis
## Theta 1 2 3 4 Total E{N}
## 0.0000 0.000 0.0021 0.0083 0.0145 0.025 1.0157
## 3.2415 0.008 0.2850 0.4031 0.2040 0.900 0.7674
##
## Lower boundary (futility or Type II Error)
## Analysis
## Theta 1 2 3 4 Total
## 0.0000 0 0.0021 0.0083 0.0145 0.025
## 3.2415 0 0.0000 0.0000 0.0000 0.000
2*(1-pnorm(OF$upper$bound))
## [1] 5.152685e-05 4.199337e-03 1.941553e-02 4.293975e-02
#Use the gsDesign function to generate the sample sizes at each stage under the Pocock rule
Pocock.ss <- gsDesign::gsDesign(k=4, test.type=2, sfu="Pocock", n.fix=500, beta=0.1)
ceiling(Pocock.ss$n.I)
## [1] 148 296 444 592
#Repeat for the O'Brien-Fleming rule
OF.ss <- gsDesign::gsDesign(k=4, test.type=2, sfu="OF", n.fix=500, beta=0.1)
ceiling(OF.ss$n.I)
## [1] 128 256 384 512
#Find the sample size per group for expected rates of 60%, 4% delta, 90% power and 5% significance level.
TOSTER::powerTOSTtwo.prop(alpha = 0.05, statistical_power = 0.9, prop1 = 0.6, prop2 = 0.6,
low_eqbound_prop = -0.04, high_eqbound_prop = 0.04
)
## The required sample size to achieve 90 % power with equivalence bounds of -0.04 and 0.04 is 3247
##
## [1] 3246.652
#Find the power if the above trial is limited to 2500 per group
TOSTER::powerTOSTtwo.prop(alpha = 0.05, N=2500, prop1 = 0.6, prop2 = 0.6,
low_eqbound_prop = -0.04, high_eqbound_prop = 0.04
)
## The statistical power is 78.57 % for equivalence bounds of -0.04 and 0.04 .
##
## [1] 0.7857316
#Find the sample size for a standard deviation of 10, delta of 2, 80% power and 5% significance level.
TOSTER::powerTOSTtwo.raw(alpha=0.05, statistical_power=0.8, sdpooled=10, low_eqbound=-2, high_eqbound=2)
## The required sample size to achieve 80 % power with equivalence bounds of -2 and 2 is 428.1924 per group, or 858 in total.
##
## [1] 428.1924
#Find the sample sizes based on standard deviations between 7 and 13.
stdev <- seq(7, 13, 1)
#Preallocate the result vector instead of growing a NULL object, and use
#seq_along() rather than 1:length() so the loop is safe even for an empty
#stdev vector; results are identical to the original loop
npergp <- numeric(length(stdev))
for (i in seq_along(stdev)) {
  #ceiling() rounds the per-group sample size up to a whole patient
  npergp[i] <- ceiling(TOSTER::powerTOSTtwo.raw(alpha = 0.05, statistical_power = 0.8,
                                                sdpooled = stdev[i],
                                                low_eqbound = -2, high_eqbound = 2))
}
## The required sample size to achieve 80 % power with equivalence bounds of -2 and 2 is 209.8143 per group, or 420 in total.
##
## The required sample size to achieve 80 % power with equivalence bounds of -2 and 2 is 274.0431 per group, or 550 in total.
##
## The required sample size to achieve 80 % power with equivalence bounds of -2 and 2 is 346.8358 per group, or 694 in total.
##
## The required sample size to achieve 80 % power with equivalence bounds of -2 and 2 is 428.1924 per group, or 858 in total.
##
## The required sample size to achieve 80 % power with equivalence bounds of -2 and 2 is 518.1128 per group, or 1038 in total.
##
## The required sample size to achieve 80 % power with equivalence bounds of -2 and 2 is 616.597 per group, or 1234 in total.
##
## The required sample size to achieve 80 % power with equivalence bounds of -2 and 2 is 723.6451 per group, or 1448 in total.
##
sample.sizes <- data.frame(stdev, npergp)
#Plot npergp against stdev
ggplot(data=sample.sizes, aes(x=stdev, y=npergp)) +
geom_line() +
geom_point() +
ggtitle("Equivalence Sample Size Scenarios") +
xlab("Standard Deviation") +
ylab("Patients per Group")
Chapter 4 - Statistical Analysis
Regression Analysis:
Analysis Sets, Subgroups, and Interactions:
Multiplicity of Data:
Wrap up:
Example code includes:
#Explore the variable names with the str function
str(Acupuncture)
## 'data.frame': 396 obs. of 25 variables:
## $ id : num 100 101 104 105 108 112 113 114 126 130 ...
## $ age : num 47 52 32 53 56 45 45 49 47 46 ...
## $ sex : Factor w/ 2 levels "Female","Male": 1 1 1 1 1 1 1 1 1 1 ...
## $ migraine : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ chronicity : num 35 8 14 10 40 27 30 49 42 3 ...
## $ treatment.group : Factor w/ 2 levels "Acupuncture",..: 1 2 2 2 2 1 1 1 2 1 ...
## $ score.baseline : num 10.8 9.5 16 32.5 16.5 ...
## $ score.baseline.4 : Factor w/ 4 levels "[6.75,15.2]",..: 1 1 2 3 2 1 4 3 2 3 ...
## $ age.group : Factor w/ 4 levels "18-34","35-44",..: 3 3 1 3 4 3 3 3 3 3 ...
## $ score.month3 : num NA NA NA 44 17.5 ...
## $ score.month12 : num NA NA 15.3 NA 23.2 ...
## $ withdrawal.reason : Factor w/ 7 levels "adverse effects",..: 5 7 NA 7 NA NA NA NA NA NA ...
## $ completedacupuncturetreatment: num NA NA NA NA NA 1 NA NA NA NA ...
## $ completer : num 0 0 1 0 1 1 1 1 1 1 ...
## $ total.therap.visits : num NA 2 0 2 0 0 7 0 0 10 ...
## $ total.gp.visits : num NA 4 0 0 0 5 1 0 1 0 ...
## $ total.spec.visits : num NA 0 0 0 0 0 0 0 0 0 ...
## $ total.days.sick : num NA 6 3 NA 23 2 6 9 19 0 ...
## $ diff.month12 : num NA NA -0.667 NA 6.75 ...
## $ pct.month12 : num NA NA -4.17 NA 40.91 ...
## $ resp35.month12 : Factor w/ 2 levels "greater than 35%",..: NA NA 2 NA 2 2 2 2 2 1 ...
## $ any.therap.visits : Factor w/ 2 levels "Did not visit CT",..: NA 2 1 2 1 1 2 1 1 2 ...
## $ any.gp.visits : Factor w/ 2 levels "Did not visit GP",..: NA 2 1 1 1 2 2 1 2 1 ...
## $ any.spec.visits : Factor w/ 2 levels "Did not visit specialist",..: NA 1 1 1 1 1 1 1 1 1 ...
## $ combined : Factor w/ 2 levels "No visits","At least one visit": NA 2 1 2 1 2 2 1 2 2 ...
#Use the relevel function to set Control as reference group
Acupuncture$treatment.group <- relevel(Acupuncture$treatment.group, ref="Control")
#Use lm to run and store the model in linreg1
#na.action = na.exclude drops rows with missing values from the fit but
#pads predict()/residuals() output with NA so it stays aligned row-for-row
#with the original data frame
linreg1 <- lm(pct.month12 ~ treatment.group + sex + score.baseline.4, data=Acupuncture,
na.action = na.exclude
)
#Display the results of linreg1
summary(linreg1)
##
## Call:
## lm(formula = pct.month12 ~ treatment.group + sex + score.baseline.4,
## data = Acupuncture, na.action = na.exclude)
##
## Residuals:
## Min 1Q Median 3Q Max
## -85.71 -28.82 -4.89 24.48 130.39
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.805 5.460 -1.063 0.288565
## treatment.groupAcupuncture -16.473 4.718 -3.492 0.000555 ***
## sexMale 2.767 6.410 0.432 0.666317
## score.baseline.4(15.2,21.2] -9.878 6.447 -1.532 0.126584
## score.baseline.4(21.2,34.6] -18.822 6.717 -2.802 0.005415 **
## score.baseline.4(34.6,94.8] -15.786 6.657 -2.371 0.018375 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 40.23 on 290 degrees of freedom
## (100 observations deleted due to missingness)
## Multiple R-squared: 0.06742, Adjusted R-squared: 0.05134
## F-statistic: 4.193 on 5 and 290 DF, p-value: 0.001076
#Use lm to run and store the model in linreg2
linreg2 <- lm(pct.month12 ~ treatment.group + score.baseline.4, data=Acupuncture, na.action = na.exclude)
#Display the results of linreg2
summary(linreg2)
##
## Call:
## lm(formula = pct.month12 ~ treatment.group + score.baseline.4,
## data = Acupuncture, na.action = na.exclude)
##
## Residuals:
## Min 1Q Median 3Q Max
## -83.371 -29.273 -4.979 24.691 129.902
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.435 5.385 -1.009 0.313616
## treatment.groupAcupuncture -16.359 4.704 -3.478 0.000582 ***
## score.baseline.4(15.2,21.2] -9.824 6.437 -1.526 0.128053
## score.baseline.4(21.2,34.6] -18.850 6.707 -2.811 0.005281 **
## score.baseline.4(34.6,94.8] -15.783 6.647 -2.374 0.018234 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 40.18 on 291 degrees of freedom
## (100 observations deleted due to missingness)
## Multiple R-squared: 0.06682, Adjusted R-squared: 0.05399
## F-statistic: 5.209 on 4 and 291 DF, p-value: 0.0004574
#Add the predicted values to the Acupuncture dataset for linreg2 using the predict function
Acupuncture$pred.linreg2 <- predict(linreg2)
#Plot the predicted values against baseline score quartile grouping by treatment.
ggplot(data = subset(Acupuncture, !is.na(pred.linreg2)),
aes(x = score.baseline.4, y = pred.linreg2, group = treatment.group)
) +
geom_line(aes(color = treatment.group)) +
geom_point(aes(color = treatment.group)) +
ggtitle("Predicted Values from Linear Regression Model") +
xlab("Baseline Score Quartile") +
ylab("Percentage Change from Baseline at M12")
#Use the relevel function to set "Control" as the reference for treatment
Acupuncture$treatment.group <- relevel(Acupuncture$treatment.group, ref="Control")
#Use the relevel function to set "less than or eq to 35%" as the reference for resp35.month12
Acupuncture$resp35.month12 <- relevel(Acupuncture$resp35.month12, ref="less than or eq to 35%")
#Use glm to run and store the model in logreg1
logreg1 <- glm(resp35.month12 ~ treatment.group + sex + score.baseline.4,
family=binomial(link="logit"), data=Acupuncture
)
#Display the results of logreg1
summary(logreg1)
##
## Call:
## glm(formula = resp35.month12 ~ treatment.group + sex + score.baseline.4,
## family = binomial(link = "logit"), data = Acupuncture)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4033 -0.9907 -0.7770 1.1163 1.7695
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.0430 0.2935 -3.553 0.00038 ***
## treatment.groupAcupuncture 0.9734 0.2465 3.949 7.84e-05 ***
## sexMale -0.2882 0.3340 -0.863 0.38818
## score.baseline.4(15.2,21.2] 0.5463 0.3359 1.626 0.10386
## score.baseline.4(21.2,34.6] 0.5865 0.3487 1.682 0.09258 .
## score.baseline.4(34.6,94.8] 0.2151 0.3481 0.618 0.53664
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 406.88 on 295 degrees of freedom
## Residual deviance: 386.90 on 290 degrees of freedom
## (100 observations deleted due to missingness)
## AIC: 398.9
##
## Number of Fisher Scoring iterations: 4
#Display the odds ratio and 95% CI for Acupuncture vs Control
#Index the coefficient by name rather than by position [2] so this stays
#correct if model terms are reordered or added; the value is unchanged
exp(coefficients(logreg1)["treatment.groupAcupuncture"])
## treatment.groupAcupuncture
## 2.646798
exp(confint(logreg1)["treatment.groupAcupuncture", ])
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## 1.640863 4.318076
#Tabulate withdrawal.reason
table(Acupuncture$withdrawal.reason, useNA="ifany")
##
## adverse effects died intercurrent illness
## 1 1 16
## lost to follow-up treatment hassle treatment ineffective
## 15 5 4
## withdrew consent <NA>
## 58 296
#Tabulate completedacupuncturetreatment by treatment.group
table(Acupuncture$completedacupuncturetreatment, Acupuncture$treatment.group, useNA="ifany")
##
## Control Acupuncture
## 0 0 35
## 1 1 131
## <NA> 193 36
#Create a per protocol flag that is TRUE if patients met the criteria
#A patient is per-protocol when they have no withdrawal reason AND either
#completed acupuncture treatment (acupuncture arm) or have no completion
#record (control arm, where none is expected)
Acupuncture <- Acupuncture %>%
mutate(pp = is.na(withdrawal.reason) &
((completedacupuncturetreatment==1 & treatment.group=="Acupuncture") |
(is.na(completedacupuncturetreatment) & treatment.group=="Control")
)
)
#completedacupuncturetreatment==1 evaluates to NA when the record is
#missing, and that NA propagates through the logical expression above;
#treat those unresolved cases as not per-protocol
Acupuncture$pp[is.na(Acupuncture$pp)] <- FALSE
Acupuncture$pp <- as.factor(Acupuncture$pp)
#Tabulate the per protocol flag
table(Acupuncture$pp)
##
## FALSE TRUE
## 144 252
#Use the relevel function to set Control as reference group
Acupuncture$treatment.group <- relevel(Acupuncture$treatment.group, ref="Control")
#Use lm to run and store the model without interaction in linregnoint
linregnoint <- lm(pct.month12 ~ treatment.group + score.baseline.4, Acupuncture, na.action = na.exclude)
#Use lm to run and store the model with interaction in linregint
linregint <- lm(pct.month12 ~ treatment.group*score.baseline.4, Acupuncture, na.action = na.exclude)
#Display the results of linregnoint and linregint
summary(linregnoint)
##
## Call:
## lm(formula = pct.month12 ~ treatment.group + score.baseline.4,
## data = Acupuncture, na.action = na.exclude)
##
## Residuals:
## Min 1Q Median 3Q Max
## -83.371 -29.273 -4.979 24.691 129.902
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.435 5.385 -1.009 0.313616
## treatment.groupAcupuncture -16.359 4.704 -3.478 0.000582 ***
## score.baseline.4(15.2,21.2] -9.824 6.437 -1.526 0.128053
## score.baseline.4(21.2,34.6] -18.850 6.707 -2.811 0.005281 **
## score.baseline.4(34.6,94.8] -15.783 6.647 -2.374 0.018234 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 40.18 on 291 degrees of freedom
## (100 observations deleted due to missingness)
## Multiple R-squared: 0.06682, Adjusted R-squared: 0.05399
## F-statistic: 5.209 on 4 and 291 DF, p-value: 0.0004574
summary(linregint)
##
## Call:
## lm(formula = pct.month12 ~ treatment.group * score.baseline.4,
## data = Acupuncture, na.action = na.exclude)
##
## Residuals:
## Min 1Q Median 3Q Max
## -84.497 -28.562 -5.565 26.525 126.990
##
## Coefficients:
## Estimate Std. Error
## (Intercept) -9.662 7.239
## treatment.groupAcupuncture -9.220 9.407
## score.baseline.4(15.2,21.2] -4.471 9.543
## score.baseline.4(21.2,34.6] -10.655 10.412
## score.baseline.4(34.6,94.8] -12.425 9.875
## treatment.groupAcupuncture:score.baseline.4(15.2,21.2] -9.478 12.993
## treatment.groupAcupuncture:score.baseline.4(21.2,34.6] -14.057 13.644
## treatment.groupAcupuncture:score.baseline.4(34.6,94.8] -5.375 13.417
## t value Pr(>|t|)
## (Intercept) -1.335 0.183
## treatment.groupAcupuncture -0.980 0.328
## score.baseline.4(15.2,21.2] -0.469 0.640
## score.baseline.4(21.2,34.6] -1.023 0.307
## score.baseline.4(34.6,94.8] -1.258 0.209
## treatment.groupAcupuncture:score.baseline.4(15.2,21.2] -0.729 0.466
## treatment.groupAcupuncture:score.baseline.4(21.2,34.6] -1.030 0.304
## treatment.groupAcupuncture:score.baseline.4(34.6,94.8] -0.401 0.689
##
## Residual standard error: 40.3 on 288 degrees of freedom
## (100 observations deleted due to missingness)
## Multiple R-squared: 0.07059, Adjusted R-squared: 0.048
## F-statistic: 3.125 on 7 and 288 DF, p-value: 0.003398
#Compare the models with the anova command
anova(linregnoint, linregint)
## Analysis of Variance Table
##
## Model 1: pct.month12 ~ treatment.group + score.baseline.4
## Model 2: pct.month12 ~ treatment.group * score.baseline.4
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 291 469699
## 2 288 467802 3 1897.3 0.3893 0.7608
#Tabulate the age group variable to view the categories
table(Acupuncture$age.group)
##
## 18-34 35-44 45-54 55-65
## 75 83 159 79
#Display the adjusted significance level
0.05/4
## [1] 0.0125
#Run the Wilcoxon Rank Sum test in each of the age subgroups
#exact=FALSE requests the normal approximation (with continuity
#correction), which avoids warnings when tied values are present
age <- c("18-34", "35-44", "45-54", "55-65")
for(group in age){
subgroup <- broom::tidy(wilcox.test(total.days.sick ~ treatment.group,
data = subset(Acupuncture, age.group==group),
exact=FALSE
)
)
#Print the subgroup label followed by its tidied one-row test summary
print(group)
print(subgroup)
}
## [1] "18-34"
## # A tibble: 1 x 4
## statistic p.value method alternative
## <dbl> <dbl> <chr> <chr>
## 1 518 0.0380 Wilcoxon rank sum test with continuity correcti~ two.sided
## [1] "35-44"
## # A tibble: 1 x 4
## statistic p.value method alternative
## <dbl> <dbl> <chr> <chr>
## 1 534 0.540 Wilcoxon rank sum test with continuity correcti~ two.sided
## [1] "45-54"
## # A tibble: 1 x 4
## statistic p.value method alternative
## <dbl> <dbl> <chr> <chr>
## 1 2557 0.130 Wilcoxon rank sum test with continuity correcti~ two.sided
## [1] "55-65"
## # A tibble: 1 x 4
## statistic p.value method alternative
## <dbl> <dbl> <chr> <chr>
## 1 630 0.678 Wilcoxon rank sum test with continuity correcti~ two.sided
#Tabulate the combined endpoint by treatment group
table(Acupuncture$combined, Acupuncture$treatment.group, useNA="ifany")
##
## Control Acupuncture
## No visits 53 65
## At least one visit 102 109
## <NA> 39 28
#Use the relevel function to set Control as reference group
Acupuncture$treatment.group <- relevel(Acupuncture$treatment.group, ref="Control")
#Use compareGroups to generate and save the treatment effect for the composite endpoint and each component
combined.test <- compareGroups::compareGroups(treatment.group ~ combined + any.therap.visits + any.gp.visits + any.spec.visits, data = Acupuncture)
# Store the results in a table
combined.test.table <- compareGroups::createTable(combined.test, show.ratio = TRUE, show.p.overall=FALSE)
#Display the results
combined.test.table
##
## --------Summary descriptives table by 'treatment.group'---------
##
## _____________________________________________________________________________
## Control Acupuncture OR p.ratio
## N=155 N=174
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
## combined:
## No visits 53 (34.2%) 65 (37.4%) Ref. Ref.
## At least one visit 102 (65.8%) 109 (62.6%) 0.87 [0.55;1.37] 0.553
## any.therap.visits:
## Did not visit CT 126 (81.3%) 142 (81.6%) Ref. Ref.
## Visited CT 29 (18.7%) 32 (18.4%) 0.98 [0.56;1.72] 0.940
## any.gp.visits:
## Did not visit GP 63 (40.6%) 85 (48.9%) Ref. Ref.
## Visited GP 92 (59.4%) 89 (51.1%) 0.72 [0.46;1.11] 0.138
## any.spec.visits:
## Did not visit specialist 145 (93.5%) 160 (92.0%) Ref. Ref.
## Visited specialist 10 (6.45%) 14 (8.05%) 1.26 [0.54;3.04] 0.590
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
Chapter 1 - Intro to Valuations (Cash is King)
Valuations Overview:
Business Models and Writing R Functions:
model <- assumptions
model$revenue <- model$sales * price_per_unit
model$direct_expense <- model$sales * cost_per_unit
model$gross_profit <- model$revenue - model$direct_expense
Pro-Forma Income Statements:
Income to Cash:
Example code includes:
# define inputs
price <- 20
print_cost <- 0.5
ship_cost <- 2
assumptions <- data.frame(year=1:5, sales=c(175, 200, 180, 100, 50))
# add revenue, expense, and profit variables
cashflow <- assumptions
cashflow$revenue <- cashflow$sales * price
cashflow$direct_expense <- cashflow$sales * (print_cost + ship_cost)
cashflow$gross_profit <- cashflow$revenue - cashflow$direct_expense
# print cashflow
print(cashflow)
## year sales revenue direct_expense gross_profit
## 1 1 175 3500 437.5 3062.5
## 2 2 200 4000 500.0 3500.0
## 3 3 180 3600 450.0 3150.0
## 4 4 100 2000 250.0 1750.0
## 5 5 50 1000 125.0 875.0
prem_ts <- data.frame(MONTH=1:60, COST_PER_SONG=0.01, SONG_LENGTH=3, REV_PER_SUB=10) %>%
mutate(PCT_ACTIVE=0.95**(MONTH-1), HOURS_PER_MONTH=528-0.08*(MONTH-40)**2)
# premium business models
premium_model <- prem_ts
premium_model$SONGS_PLAYED <- prem_ts$PCT_ACTIVE * prem_ts$HOURS_PER_MONTH * 1 / prem_ts$SONG_LENGTH
premium_model$REV_SUBSCRIPTION <- prem_ts$PCT_ACTIVE * prem_ts$REV_PER_SUB
premium_model$COST_SONG_PLAYED <- premium_model$SONGS_PLAYED * prem_ts$COST_PER_SONG
# inspect results
head(premium_model)
## MONTH COST_PER_SONG SONG_LENGTH REV_PER_SUB PCT_ACTIVE HOURS_PER_MONTH
## 1 1 0.01 3 10 1.0000000 406.32
## 2 2 0.01 3 10 0.9500000 412.48
## 3 3 0.01 3 10 0.9025000 418.48
## 4 4 0.01 3 10 0.8573750 424.32
## 5 5 0.01 3 10 0.8145062 430.00
## 6 6 0.01 3 10 0.7737809 435.52
## SONGS_PLAYED REV_SUBSCRIPTION COST_SONG_PLAYED
## 1 135.4400 10.000000 1.354400
## 2 130.6187 9.500000 1.306187
## 3 125.8927 9.025000 1.258927
## 4 121.2671 8.573750 1.212671
## 5 116.7459 8.145062 1.167459
## 6 112.3324 7.737809 1.123324
free_ts <- data.frame(MONTH=1:60, PROP_MUSIC=0.95, REV_PER_AD=0.02, REV_PER_CLICK=10,
COST_PER_SONG=0.01, SONG_LENGTH=3, AD_LENGTH=0.25, CLICK_THROUGH_RATE=0.001
) %>%
mutate(PCT_ACTIVE=0.97**(MONTH-1), HOURS_PER_MONTH=480-0.08*(MONTH-40)**2)
# freemium business models
freemium_model <- free_ts
freemium_model$SONGS_PLAYED <- free_ts$PCT_ACTIVE * free_ts$HOURS_PER_MONTH * free_ts$PROP_MUSIC / free_ts$SONG_LENGTH
freemium_model$ADS_PLAYED <- free_ts$PCT_ACTIVE * free_ts$HOURS_PER_MONTH * (1-free_ts$PROP_MUSIC) / free_ts$AD_LENGTH
freemium_model$REV_AD_PLAYED <- freemium_model$ADS_PLAYED * free_ts$REV_PER_AD
freemium_model$REV_AD_CLICKED <- freemium_model$ADS_PLAYED * free_ts$CLICK_THROUGH_RATE * free_ts$REV_PER_CLICK
freemium_model$COST_SONG_PLAYED <- freemium_model$SONGS_PLAYED * free_ts$COST_PER_SONG
# examine output
head(freemium_model)
## MONTH PROP_MUSIC REV_PER_AD REV_PER_CLICK COST_PER_SONG SONG_LENGTH AD_LENGTH
## 1 1 0.95 0.02 10 0.01 3 0.25
## 2 2 0.95 0.02 10 0.01 3 0.25
## 3 3 0.95 0.02 10 0.01 3 0.25
## 4 4 0.95 0.02 10 0.01 3 0.25
## 5 5 0.95 0.02 10 0.01 3 0.25
## 6 6 0.95 0.02 10 0.01 3 0.25
## CLICK_THROUGH_RATE PCT_ACTIVE HOURS_PER_MONTH SONGS_PLAYED ADS_PLAYED
## 1 0.001 1.0000000 358.32 113.4680 71.66400
## 2 0.001 0.9700000 364.48 111.9561 70.70912
## 3 0.001 0.9409000 370.48 110.3851 69.71693
## 4 0.001 0.9126730 376.32 108.7614 68.69142
## 5 0.001 0.8852928 382.00 107.0909 67.63637
## 6 0.001 0.8587340 387.52 105.3793 66.55532
## REV_AD_PLAYED REV_AD_CLICKED COST_SONG_PLAYED
## 1 1.433280 0.7166400 1.134680
## 2 1.414182 0.7070912 1.119561
## 3 1.394339 0.6971693 1.103851
## 4 1.373828 0.6869142 1.087614
## 5 1.352727 0.6763637 1.070909
## 6 1.331106 0.6655532 1.053793
# Define function: calc_business_model
# Build a simple business model from an assumptions data frame (must
# contain a `sales` column): adds revenue, direct expense, and gross
# profit columns and returns the augmented data frame. The input
# `assumptions` is not modified.
calc_business_model <- function(assumptions, price, print_cost, ship_cost) {
  model <- assumptions
  model$revenue <- model$sales * price
  model$direct_expense <- model$sales * (print_cost + ship_cost)
  model$gross_profit <- model$revenue - model$direct_expense
  model
}
# Call calc_business_model function for different sales prices
assumptions
## year sales
## 1 1 175
## 2 2 200
## 3 3 180
## 4 4 100
## 5 5 50
calc_business_model(assumptions, 20, 0.5, 2)$gross_profit
## [1] 3062.5 3500.0 3150.0 1750.0 875.0
calc_business_model(assumptions, 25, 0.5, 2)$gross_profit
## [1] 3937.5 4500.0 4050.0 2250.0 1125.0
# Inputs
production <- data.frame(Month=1:60, Units=rep(c(60, 50, 40, 30), times=15))
cost <- 100000
life <- 60
salvage <- 10000
# Compute depreciation
production$Depr_Straight <- (cost - salvage)/life
production$Depr_UnitsProd <- (cost - salvage)*(production$Units) / sum(production$Units)
# Plot two depreciation schedules
ggplot(production, aes(x = Month)) +
geom_line(aes(y = Depr_Straight)) +
geom_line(aes(y = Depr_UnitsProd))
# Business model
cashflow
## year sales revenue direct_expense gross_profit
## 1 1 175 3500 437.5 3062.5
## 2 2 200 4000 500.0 3500.0
## 3 3 180 3600 450.0 3150.0
## 4 4 100 2000 250.0 1750.0
## 5 5 50 1000 125.0 875.0
cashflow$revenue <- cashflow$revenue + 2 * cashflow$sales
cashflow$gross_profit <- cashflow$revenue - cashflow$direct_expense
# Income statement
cashflow$depr_sl <- (1000 - 0) / 5
cashflow$operating_profit <- cashflow$gross_profit - cashflow$depr_sl
cashflow$tax <- cashflow$operating_profit * 0.3
cashflow$net_income <- cashflow$operating_profit - cashflow$tax
# Inspect dataset
cashflow
## year sales revenue direct_expense gross_profit depr_sl operating_profit
## 1 1 175 3850 437.5 3412.5 200 3212.5
## 2 2 200 4400 500.0 3900.0 200 3700.0
## 3 3 180 3960 450.0 3510.0 200 3310.0
## 4 4 100 2200 250.0 1950.0 200 1750.0
## 5 5 50 1100 125.0 975.0 200 775.0
## tax net_income
## 1 963.75 2248.75
## 2 1110.00 2590.00
## 3 993.00 2317.00
## 4 525.00 1225.00
## 5 232.50 542.50
# Calculate income statement
assumptions <- data.frame(unit_sales=100000*c(1, 2, 4, 8), machines_purchased=c(1, 1, 2, 4),
depreciation=10000000*c(4, 8, 16, 32)
)
assumptions
## unit_sales machines_purchased depreciation
## 1 1e+05 1 4.0e+07
## 2 2e+05 1 8.0e+07
## 3 4e+05 2 1.6e+08
## 4 8e+05 4 3.2e+08
price_per_unit <- 1000
cogs_per_unit <- 450
labor_per_unit <- 50
income_statement <- assumptions
income_statement$revenue <- income_statement$unit_sales * price_per_unit
income_statement$expenses <- income_statement$unit_sales * (cogs_per_unit + labor_per_unit)
income_statement$earnings <- income_statement$revenue - income_statement$expenses - income_statement$depreciation
# Summarize cumulative earnings
sum(income_statement$earnings)
## [1] 1.5e+08
sum(income_statement$earnings) / sum(income_statement$revenue)
## [1] 0.1
# calculate free cashflow
cashflow <- income_statement
cashflow$operating_cf <- cashflow$earnings + cashflow$depreciation
cashflow$capex <- cashflow$machines_purchased * 160000000
cashflow$free_cf <- cashflow$operating_cf - cashflow$capex
# summarize free cashflow
sum(cashflow$free_cf)
## [1] -5.3e+08
Chapter 2 - Key Financial Concepts (Time is Money)
Time Value of Money:
Different Discount Rates:
Discounting Multiple Cash Flows:
Example code includes:
# Assign input variables
fv <- 100
r <- 0.08
# Calculate PV if receive FV in 1 year
pv_1 <- 100 / (1 + r)**1
pv_1
## [1] 92.59259
# Calculate PV if receive FV in 5 years
pv_5 <- 100 / (1 + r)**5
pv_5
## [1] 68.05832
# Calculate difference
pv_1 - pv_5
## [1] 24.53427
# Define PV function: calc_pv
# Present value of a future amount `fv` received after `n` periods at
# per-period discount rate `r`. Vectorized over all three arguments.
calc_pv <- function(fv, r, n) {
  fv / (1 + r)^n
}
# Use PV function for 1 input
calc_pv(100, 0.08, 5)
## [1] 68.05832
# Use PV function for range of inputs
n_range <- 1:10
pv_range <- calc_pv(100, 0.08, n_range)
pv_range
## [1] 92.59259 85.73388 79.38322 73.50299 68.05832 63.01696 58.34904 54.02689
## [9] 50.02490 46.31935
# Calculate present values in dataframe
present_values <- data.frame(n = 1:10) %>% mutate(pv = 100 / (1 + 0.08)**n)
# Plot relationship between time periods versus present value
ggplot(present_values, aes(x = n, y = pv)) +
geom_line() +
geom_label(aes(label = paste0("$",round(pv,0)))) +
ylim(0,100) +
labs(
title = "Discounted Value of $100 by Year Received",
x = "Number of Years in the Future",
y = "Present Value ($)"
)
# Calculate present values over range of time periods and discount rates
present_values <-
expand.grid(n = 1:10, r = seq(0.05,0.12,0.01)) %>%
mutate(pv = calc_pv(100, r, n))
# Plot present value versus time delay with a separate colored line for each rate
ggplot(present_values, aes(x = n, y = pv, col = factor(r))) +
geom_line() +
ylim(0,100) +
labs(
title = "Discounted Value of $100 by Year Received",
x = "Number of Years in the Future",
y = "Present Value ($)",
col = "Discount Rate"
)
# Convert monthly to other time periods
r1_mth <- 0.005
r1_quart <- (1 + r1_mth)^3 - 1
r1_semi <- (1 + r1_mth)^6 - 1
r1_ann <- (1 + r1_mth)^12 - 1
# Convert years to other time periods
r2_ann <- 0.08
r2_mth <- (1 + r2_ann)^(1/12) - 1
r2_quart <- (1 + r2_ann)^(1/4) - 1
# Convert real to nominal
r1_real <- 0.08
inflation1 <- 0.03
(r1_nom <- (1 + r1_real) * (1 + inflation1) - 1)
## [1] 0.1124
# Convert nominal to real
r2_nom <- 0.2
inflation2 <- 0.05
(r2_real <- (1 + r2_nom) / (1 + inflation2) - 1)
## [1] 0.1428571
# Define cashflows
cashflow_a <- c(5000, rep(0,6))
cashflow_b <- c(0, rep(1000, 6))
# Calculate pv for each time period
disc_cashflow_a <- calc_pv(cashflow_a, 0.06, 0:6)
disc_cashflow_b <- calc_pv(cashflow_b, 0.06, 0:6)
# Calculate and report total present value for each option
(pv_a <- sum(disc_cashflow_a))
## [1] 5000
(pv_b <- sum(disc_cashflow_b))
## [1] 4917.324
# Define cashflows
cashflow_old <- rep(-500, 11)
cashflow_new <- c(-2200, rep(-300, 10))
options <- data.frame(time = rep(0:10, 2),
option = c(rep("Old",11), rep("New",11)),
cashflow = c(cashflow_old, cashflow_new)
)
# Calculate total expenditure with and without discounting
options %>%
group_by(option) %>%
summarize(sum_cashflow = sum(cashflow),
sum_disc_cashflow = sum(calc_pv(cashflow, 0.12, 0:10))
)
## # A tibble: 2 x 3
## option sum_cashflow sum_disc_cashflow
## <fct> <dbl> <dbl>
## 1 New -5200 -3895.
## 2 Old -5500 -3325.
Chapter 3 - Prioritizing Profitability (Financial Metrics)
Profitability Metrics and Payback Period:
NPV, IRR, Profitability Index:
Terminal Value:
Comparing and Computing Metrics:
Wrap up:
Example code includes:
cashflows <- c(-50000, 1000, 5000, 5000, 5000, 10000, 10000, 10000, 10000, 10000, 10000)
# Inspect variables
cashflows
## [1] -50000 1000 5000 5000 5000 10000 10000 10000 10000 10000
## [11] 10000
# Calculate cumulative cashflows
cum_cashflows <- cumsum(cashflows)
# Identify payback period
payback_period <- min(which(cum_cashflows >= 0)) - 1
# View result
payback_period
## [1] 8
# Define payback function: calc_payback
# Returns the first period (0-indexed) at which the cumulative cashflow
# becomes non-negative, i.e. the number of periods needed to recover the
# initial outlay. If the investment is never recovered, returns Inf
# explicitly instead of triggering the min()-on-empty-vector warning.
calc_payback <- function(cashflows) {
  cum_cashflows <- cumsum(cashflows)
  recovered <- which(cum_cashflows >= 0)
  if (length(recovered) == 0) {
    return(Inf)  # cumulative cashflow never reaches zero
  }
  # which() returns indices in increasing order, so the first element is
  # the earliest recovery point; subtract 1 to convert to a 0-based period
  recovered[1] - 1
}
# Test out our function
cashflows <- c(-100, 50, 50, 50)
calc_payback(cashflows) == 2
## [1] TRUE
cashflows <- c(-50000, 1000, 5000, 5000, 5000, 10000, 10000, 10000, 10000, 10000, 10000)
# normal payback period
payback_period <- calc_payback(cashflows)
# discounted payback period
discounted_cashflows <- calc_pv(cashflows, r = 0.06, n = 0:(length(cashflows)-1) )
payback_period_disc <- calc_payback(discounted_cashflows)
# compare results
payback_period
## [1] 8
payback_period_disc
## [1] 10
# Define NPV function: calc_npv
# Net present value: the sum of each cashflow discounted back from its
# period at rate `r`, where the first element of `cashflows` occurs at
# time 0 (and is therefore undiscounted).
calc_npv <- function(cashflows, r) {
  periods <- seq_along(cashflows) - 1
  sum(cashflows / (1 + r)^periods)
}
# The base R function stats::uniroot can help find values between a lower bound (lower) and an upper bound (upper) where the value of a function is zero
# This can help us calculate the internal rate of return (IRR) for which NPV = 0.
# Define IRR function: calc_irr
# Finds the internal rate of return: the discount rate at which the NPV of
# the cashflows equals zero, via stats::uniroot.
# extendInt = "yes" lets uniroot widen the initial [0, 1] search interval
# when the IRR lies outside it, instead of failing with
# "f() values at end points not of opposite sign".
calc_irr <- function(cashflows) {
  uniroot(calc_npv,
          interval = c(0, 1),
          extendInt = "yes",
          cashflows = cashflows)$root
}
# Try out function on valid input
cashflows <- c(-100, 20, 20, 20, 20, 20, 20, 10, 5)
calc_irr(cashflows)
## [1] 0.08296991
# Define profitability index function: calc_profitability_index
# Profitability index: present value of the future cashflows per unit of
# initial investment (PI > 1 indicates a value-creating project).
# `init_investment` may be given as a negative outflow; abs() normalizes it.
calc_profitability_index <- function(init_investment, future_cashflows, r) {
  calc_npv(future_cashflows, r) / abs(init_investment)
}
# Try out function on valid input
init_investment <- -100
cashflows <- c(0, 20, 20, 20, 20, 20, 20, 10, 5)
calc_profitability_index(init_investment, cashflows, 0.08)
## [1] 1.009938
# pull last year cashflow from vector of cashflows
last_year_cashflow <- cashflow[length(cashflow)]
last_period_n <- length(cashflow) - 1
# calculate terminal value for different discount raes
terminal_value_1 <- last_year_cashflow / ((0.15 - 0.1)*(1 + 0.15)^last_period_n)
terminal_value_2 <- last_year_cashflow / ((0.15 - 0.01)*(1 + 0.15)^last_period_n)
terminal_value_3 <- last_year_cashflow / ((0.15 + 0.05)*(1 + 0.15)^last_period_n)
# inspect results
terminal_value_1
## free_cf
## 1 -719183902
## 2 -392282129
## 3 -784564257
## 4 -1569128514
terminal_value_2
## free_cf
## 1 -256851394
## 2 -140100760
## 3 -280201520
## 4 -560403041
terminal_value_3
## free_cf
## 1 -179795976
## 2 -98070532
## 3 -196141064
## 4 -392282129
cashflow1 <- c(-50000, 100, 2000, 2000, 5000, 10000, 10000, 10000, 10000, 10000, 10000)
cashflow2 <- c(-1e+05, 20000, 20000, 20000, 20000, 20000)
cashflow3 <- c(-8000, 6000, 5000, 4000, 3000, 2000, 1000, 0)
# calculate internal rate of return (IRR) for each stream of cashflows
r1 <- calc_irr(cashflow1)
r2 <- calc_irr(cashflow2)
r3 <- calc_irr(cashflow3)
# calculate net present value (NPV) for each stream of cashflows, assuming r = irr
npv1 <- calc_npv(cashflow1, r1)
npv2 <- calc_npv(cashflow2, r2)
npv3 <- calc_npv(cashflow3, r3)
# examine results
npv1
## [1] -5.859804
npv2
## [1] 0
npv3
## [1] -0.1359058
cf1 <- c(-5000, 450, 450, 450, 450, 450, 450, 450, 450, 450, 450)
cf2 <- c(-5000, 2000, 2000, 2000, 2000, 2000, 2000, -2000, -2000, -2000, -2000)
rates <- c(0, 0.005, 0.01, 0.015, 0.02, 0.025, 0.03, 0.035, 0.04, 0.045, 0.05, 0.055, 0.06, 0.065, 0.07, 0.075, 0.08, 0.085, 0.09, 0.095, 0.1, 0.105, 0.11, 0.115, 0.12, 0.125, 0.13, 0.135, 0.14, 0.145, 0.15, 0.155, 0.16, 0.165, 0.17, 0.175, 0.18, 0.185, 0.19, 0.195, 0.2, 0.205, 0.21, 0.215, 0.22, 0.225, 0.23, 0.235, 0.24, 0.245, 0.25)
# create dataset of NPV for each cashflow and rate
npv_by_rates <- data.frame(rates) %>%
group_by(rates) %>%
mutate(npv1 = calc_npv(cf1, rates), npv2 = calc_npv(cf2, rates))
# plot cashflows over different discount rates
ggplot(npv_by_rates, aes(x = rates, y = npv1)) +
geom_line() +
geom_line(aes(y = npv2)) +
labs(title = "NPV by Discount Rate", subtitle = "A Tale of Two Troubling Cashflows",
y = "NPV ($)",x = "Discount Rate (%)"
) +
annotate("text", x = 0.2, y = -500, label = "Two break-even points") +
annotate("text", x = 0.2, y = -2500, label = "No break-even point")
cashflows <- data.frame(option=rep(1:4, each=11), time=rep(0:10, times=4),
cashflow=c(-10, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, -1000, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, -1e+05, 20000, 20000, 20000, 20000, 20000, 20000, 20000, 20000, 20000, 20000, -10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)
)
# calculate summary metrics
cashflow_comparison <-
cashflows %>%
group_by(option) %>%
summarize(npv = calc_npv(cashflow, 0.1), irr = calc_irr(cashflow))
# inspect output
cashflow_comparison
## # A tibble: 4 x 3
## option npv irr
## <int> <dbl> <dbl>
## 1 1 14.6 0.385
## 2 2 843. 0.273
## 3 3 22891. 0.151
## 4 4 -3.86 0
# Visualize the summary metrics: NPV (x) versus IRR (y), one point per
# project option. The horizontal line marks the 10% discount rate used for
# the NPV calculation, so options above the line beat the hurdle rate.
ggplot(cashflow_comparison, aes(x = npv, y = irr, col = factor(option))) +
  geom_point(size = 5) +
  geom_hline(yintercept = 0.1) +
  # Spell out `labels` in full -- the original relied on partial argument
  # matching (`label =`), which lintr flags and is fragile across versions.
  scale_y_continuous(labels = scales::percent) +
  scale_x_continuous(labels = scales::dollar) +
  labs(title = "NPV versus IRR for Project Alternatives",
       subtitle = "NPV calculation assumes 10% discount rate",
       # Fixed typo in the displayed caption ("asses" -> "assess").
       caption = "Line shows actual discount rate to assess IRR break-even",
       x = "NPV ($)", y = "IRR (%)", col = "Option"
  )
Chapter 4 - Understanding Outcomes
Building a Business Case:
Scenario Analysis with tidyr and purrr:
Sensitivity analysis:
Communicating Cashflow Concepts:
Advanced Topics in Cashflow Modeling:
Example code includes:
assumptions <- data.frame(year=0:10,
unit_sales_per_day=c(0, 10, 12, 14, 15, 16, 17, 18, 18, 18, 18),
capex=c(5000, rep(0, 10)),
pct_cannibalization=c(0, rep(0.25, 10)),
maintenance_cost=c(0, rep(250, 10)),
depreciation_cost=c(0, rep(500, 10)),
profit_margin_per_nitro=3,
profit_margin_per_regular=1,
labor_cost_per_hour=8,
days_open_per_year=250
)
# Check the first few rows of the data
head(assumptions)
## year unit_sales_per_day capex pct_cannibalization maintenance_cost
## 1 0 0 5000 0.00 0
## 2 1 10 0 0.25 250
## 3 2 12 0 0.25 250
## 4 3 14 0 0.25 250
## 5 4 15 0 0.25 250
## 6 5 16 0 0.25 250
## depreciation_cost profit_margin_per_nitro profit_margin_per_regular
## 1 0 3 1
## 2 500 3 1
## 3 500 3 1
## 4 500 3 1
## 5 500 3 1
## 6 500 3 1
## labor_cost_per_hour days_open_per_year
## 1 8 250
## 2 8 250
## 3 8 250
## 4 8 250
## 5 8 250
## 6 8 250
# Check the variable names of the data
names(assumptions)
## [1] "year" "unit_sales_per_day"
## [3] "capex" "pct_cannibalization"
## [5] "maintenance_cost" "depreciation_cost"
## [7] "profit_margin_per_nitro" "profit_margin_per_regular"
## [9] "labor_cost_per_hour" "days_open_per_year"
# Plot the trend of unit_sales_per_day by year
ggplot(assumptions, aes(x = year, y = unit_sales_per_day)) +
geom_line()
tax_rate <- 0.36
# Create the cashflow_statement dataframe
cashflow_statement <-
mutate(assumptions,
sales_per_year = unit_sales_per_day * days_open_per_year,
sales_revenue = sales_per_year * profit_margin_per_nitro,
total_revenue = sales_revenue,
labor_cost = days_open_per_year * 0.5 * labor_cost_per_hour,
cannibalization_cost = sales_per_year * pct_cannibalization * profit_margin_per_regular,
direct_expense = labor_cost + cannibalization_cost + maintenance_cost,
gross_profit = total_revenue - direct_expense,
operating_income = gross_profit - depreciation_cost,
net_income = operating_income * (1 - tax_rate),
cashflow = net_income + depreciation_cost - capex
)
# build individual scenarios
optimist <- mutate(assumptions, unit_sales_per_day = unit_sales_per_day * 1.2, pct_cannibalization = 0.1)
pessimist <- mutate(assumptions, unit_sales_per_day = unit_sales_per_day * 0.8, profit_margin_per_nitro = 1)
# combine into one dataset
scenarios <-
bind_rows(
mutate(pessimist, scenario = "pessimist"),
mutate(assumptions, scenario = "realist"),
mutate(optimist, scenario = "optimist")
)
# Build the pro-forma cashflow statement from a data frame of model
# assumptions (one row per year). Mirrors the `cashflow_statement` pipeline
# above, but wrapped as a function so it can be mapped over scenarios.
#
# Args:
#   assumptions: data frame with the columns created in `assumptions` above
#                (unit_sales_per_day, days_open_per_year, capex, ...).
#   tax_rate:    tax rate applied to operating income. Defaults to 0.36 so
#                existing callers are unchanged; previously this value was
#                hard-coded inline, duplicating the `tax_rate` global used
#                by the `cashflow_statement` pipeline.
#
# Returns: the input data frame with revenue, cost, income, and cashflow
# columns appended.
calc_model <- function(assumptions, tax_rate = 0.36){
  mutate( assumptions,
    sales_per_year = unit_sales_per_day * days_open_per_year,
    sales_revenue = sales_per_year * profit_margin_per_nitro,
    total_revenue = sales_revenue,
    # 0.5 hours of labor per open day at the hourly rate.
    labor_cost = days_open_per_year * 0.5 * labor_cost_per_hour,
    # Nitro sales displace a fraction of regular-coffee profit.
    cannibalization_cost = sales_per_year * pct_cannibalization * profit_margin_per_regular,
    direct_expense = labor_cost + cannibalization_cost + maintenance_cost,
    gross_profit = total_revenue - direct_expense,
    operating_income = gross_profit - depreciation_cost,
    net_income = operating_income * (1 - tax_rate),
    # Depreciation is non-cash: add it back, then subtract capital outlays.
    cashflow = net_income + depreciation_cost - capex
  )
}
# NPV helper for mapped scenarios: pulls the `cashflow` column out of a
# cashflow-statement data frame and sums its discounted present values at
# rate `r` (period 0 = first row).
calc_npv_from_cashflow <- function(cashflow, r){
  flows <- cashflow[["cashflow"]]
  periods <- 0:(length(flows) - 1)
  sum(calc_pv(flows, r, periods))
}
# calculate scenario NPVs
scenario_analysis <- scenarios %>%
nest(-scenario) %>%
mutate(cashflow = map(data, calc_model)) %>%
mutate(npv = map_dbl(cashflow, calc_npv_from_cashflow, 0.2))
## Warning: All elements of `...` must be named.
## Did you want `data = c(year, unit_sales_per_day, capex, pct_cannibalization, maintenance_cost,
## depreciation_cost, profit_margin_per_nitro, profit_margin_per_regular,
## labor_cost_per_hour, days_open_per_year)`?
# inspect results
select(scenario_analysis, scenario, npv)
## # A tibble: 3 x 2
## scenario npv
## <chr> <dbl>
## 1 pessimist -2505.
## 2 realist 18042.
## 3 optimist 25019.
# scenario analysis bar chart
ggplot(data = scenario_analysis, aes(x = scenario, y = npv, fill = scenario)) +
geom_bar(stat = "identity") +
scale_y_continuous(labels = scales::dollar) +
labs(title = "NPV Scenario Analysis of Nitro Coffee Expansion") +
guides(fill = FALSE)
# define sensitivity factor function
# Scale a single column of `data` (named by the string `metric`) by
# `factor`, returning the modified copy. Used to perturb one assumption at
# a time for the sensitivity analysis below.
factor_data <- function(data, metric, factor){
  perturbed <- data
  perturbed[[metric]] <- perturbed[[metric]] * factor
  perturbed
}
# create sensitivity analysis
sensitivity <-
expand.grid(
factor = seq(0.5,1.5,0.1),
metric = c("profit_margin_per_nitro", "labor_cost_per_hour", "pct_cannibalization", "unit_sales_per_day")) %>%
mutate(scenario = map2(metric, factor, ~factor_data(assumptions, .x, .y))) %>%
mutate(cashflow = map(scenario, calc_model)) %>%
mutate(npv = map_dbl(cashflow, calc_npv_from_cashflow, r = 0.2))
# Plot projected NPV as each assumption is scaled from 50% to 150% of its
# base value; steeper lines indicate assumptions the model is more
# sensitive to.
ggplot(sensitivity,
       aes(x = factor, y = npv, col = metric)
) +
  geom_line() +
  scale_x_continuous(labels = scales::percent) +
  scale_y_continuous(labels = scales::dollar) +
  labs(
    # Fixed typo in the displayed title ("Sensivity" -> "Sensitivity").
    title = "Sensitivity Analysis",
    x = "Factor on Original Assumption",
    y = "Projected NPV",
    col = "Metric"
  )
# examine current cashflow structure
tidy_cashflow <- data.frame(Month=1:6,
Received=c(100, 200, 300, 400, 500, 500),
Spent=c(150, 175, 200, 225, 250, 250)
)
# create long_cashflow with gather
# long_cashflow <- tidyr::gather(cashflow, key = Month, value = Value, -Metric)
# create tidy_cashflow with spread
# tidy_cashflow <- tidyr::spread(long_cashflow, key = Metric, value = Value)
# examine results
tidy_cashflow
## Month Received Spent
## 1 1 100 150
## 2 2 200 175
## 3 3 300 200
## 4 4 400 225
## 5 5 500 250
## 6 6 500 250
# create long_cashflow with gather
long_cashflow <- tidyr::gather(tidy_cashflow, key = Metric, value = Value, -Month)
# create untidy_cashflow with spread
untidy_cashflow <- tidyr::spread(long_cashflow, key = Month, value = Value)
# examine results
untidy_cashflow
## Metric 1 2 3 4 5 6
## 1 Received 100 200 300 400 500 500
## 2 Spent 150 175 200 225 250 250
gross_profit_summary <- data.frame(metric=c("Sales Revenue", "Keg Cost", "Cannibalization Cost", "Labor Cost", "Maintenance Cost"),
value=c(187200, -78240, -31200, -10000, -2500)
)
# compute min and maxes for each line item
waterfall_items <-
mutate(gross_profit_summary,
end = cumsum(value),
start = lag(cumsum(value), 1, default = 0))
# compute totals row for waterfall metrics
waterfall_summary <-
data.frame(metric = "Gross Profit",
end = sum(gross_profit_summary$value),
start = 0)
# combine line items with summary row
waterfall_data <-
bind_rows(waterfall_items, waterfall_summary) %>%
mutate(row_num = row_number())
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector
# Plot waterfall diagram
ggplot(waterfall_data, aes(fill = (end > start))) +
geom_rect(aes(xmin = row_num - 0.25, xmax = row_num + 0.25,
ymin = start, ymax = end)) +
geom_hline(yintercept = 0) +
scale_x_continuous(breaks = waterfall_data$row_num, labels = waterfall_data$metric) +
# Styling provided for you - check out a ggplot course for more information!
scale_y_continuous(labels = scales::dollar) +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
axis.title.x = element_blank()) +
guides(fill = FALSE) +
labs(
title = "Gross Profit for Proposed Nitro Coffee Expansion",
subtitle = "Based on pro forma 10-year forecast")
Chapter 1 - General Strategies for Visualizing Big Data
Visualizing summaries:
Adding more detail to summaries:
Visualizing subsets:
Visualizing all subsets:
Example code includes:
load("./RInputFiles/tx_sub.RData")
glimpse(tx)
## Observations: 1,000,000
## Variables: 7
## $ pick_day <date> 2016-07-09, 2016-07-28, 2016-07-20, 2016-07-30,...
## $ pick_dow <fct> Sat, Thu, Wed, Sat, Tue, Thu, Fri, Sun, Mon, Thu...
## $ total_amount <dbl> 47.60, 9.96, 6.80, 11.75, 7.30, 12.05, 13.80, 14...
## $ tip_amount <dbl> 23.80, 1.66, 1.00, 1.95, 0.00, 2.75, 0.00, 2.36,...
## $ payment_type <fct> Card, Card, Card, Card, Cash, Card, Cash, Card, ...
## $ trip_duration <dbl> 26.116667, 5.866667, 4.916667, 10.350000, 6.8666...
## $ pick_wkday <lgl> FALSE, TRUE, TRUE, FALSE, TRUE, TRUE, TRUE, FALS...
tx <- tx %>%
rename(pickup_date=pick_day, pickup_dow=pick_dow)
glimpse(tx)
## Observations: 1,000,000
## Variables: 7
## $ pickup_date <date> 2016-07-09, 2016-07-28, 2016-07-20, 2016-07-30,...
## $ pickup_dow <fct> Sat, Thu, Wed, Sat, Tue, Thu, Fri, Sun, Mon, Thu...
## $ total_amount <dbl> 47.60, 9.96, 6.80, 11.75, 7.30, 12.05, 13.80, 14...
## $ tip_amount <dbl> 23.80, 1.66, 1.00, 1.95, 0.00, 2.75, 0.00, 2.36,...
## $ payment_type <fct> Card, Card, Card, Card, Cash, Card, Cash, Card, ...
## $ trip_duration <dbl> 26.116667, 5.866667, 4.916667, 10.350000, 6.8666...
## $ pick_wkday <lgl> FALSE, TRUE, TRUE, FALSE, TRUE, TRUE, TRUE, FALS...
# Summarize taxi ride count by pickup day
daily_count <- tx %>%
group_by(pickup_date) %>%
summarise(n_rides=n())
# Create a line plot
ggplot(daily_count, aes(x=pickup_date, y=n_rides)) +
geom_line()
# Create a histogram of total_amount
ggplot(tx, aes(x=total_amount)) +
geom_histogram() +
scale_x_log10()
## Warning: Transformation introduced infinite values in continuous x-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 58 rows containing non-finite values (stat_bin).
# Create a bar chart of payment_type
ggplot(tx, aes(x=payment_type)) +
geom_bar()
# Create a hexagon-binned plot of total_amount vs. trip_duration
ggplot(tx, aes(x=trip_duration, y=total_amount)) +
geom_hex(bins=75) +
scale_x_log10() +
scale_y_log10()
## Warning in self$trans$transform(x): NaNs produced
## Warning in self$trans$transform(x): Transformation introduced infinite
## values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 1058 rows containing non-finite values (stat_binhex).
# Summarize taxi ride counts by payment type, pickup date, and pickup day of week
daily_count <- tx %>%
filter(payment_type %in% c("Card", "Cash")) %>%
group_by(payment_type, pickup_date, pickup_dow) %>%
summarise(n_rides=n())
# Plot the data
ggplot(daily_count, aes(x=pickup_date, y=n_rides)) +
geom_point() +
facet_grid(payment_type ~ pickup_dow) +
coord_fixed(ratio = 0.4)
# Histogram of the tip amount faceted on payment type
ggplot(tx, aes(x=tip_amount+0.01)) +
geom_histogram() +
scale_x_log10() +
facet_wrap(~ payment_type, ncol=1, scales="free_y")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Get data ready to plot
amount_compare <- tx %>%
mutate(total_no_tip = total_amount - tip_amount) %>%
filter(total_no_tip <= 20) %>%
sample_n(size=round(nrow(.)/20), replace=FALSE) %>%
select(total_amount, total_no_tip, payment_type) %>%
tidyr::gather(amount_type, amount, -payment_type)
# Quantile plot
ggplot(amount_compare, aes(sample=amount, color=payment_type)) +
geom_qq(distribution=stats::qunif, shape = 21) +
facet_wrap(~ amount_type) +
ylim(c(3, 20))
## Warning: Removed 2202 rows containing missing values (geom_point).
Chapter 2 - ggplot2 + Trelliscope JS
Faceting with Trelliscope JS:
Interacting with Trelliscope JS displays:
Additional Trelliscope JS features:
Adding your own cognostics:
Example code includes:
library(trelliscopejs)
data(gapminder, package="gapminder")
glimpse(gapminder)
# Create the plot
gapminder %>%
filter(continent=="Oceania") %>%
ggplot(aes(x=year, y=lifeExp)) +
geom_line() +
# Facet on country and continent
facet_trelliscope(~ country + continent, data=gapminder)
gapminder %>%
filter(continent=="Oceania") %>%
ggplot(aes(x=year, y=lifeExp)) +
geom_line() +
facet_trelliscope(~ country + continent, name = "lifeExp_by_country",
desc = "Life expectancy vs. year per country", nrow = 1, ncol = 2
)
# Create the plot
gapminder %>%
filter(continent=="Oceania") %>%
ggplot(aes(x=year, y=lifeExp)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
facet_trelliscope(~ country + continent, name = "lifeExp_by_country",
desc = "Life expectancy vs. year for 142 countries.",
nrow = 2, ncol = 3,
# Set the scales
scales="sliced",
# Specify automatic cognostics
auto_cog=TRUE
)
# Group by country and create the two new variables
gap <- gapminder %>%
filter(continent=="Oceania") %>%
group_by(country) %>%
mutate(delta_lifeExp = tail(lifeExp, 1) - head(lifeExp, 1),
ihme_link = paste0("http://www.healthdata.org/", country)
)
# Add the description
gap$delta_lifeExp <- cog(gap$delta_lifeExp, desc = "Overall change in life expectancy")
# Specify the default label
gap$ihme_link <- cog(gap$ihme_link, default_label = TRUE)
ggplot(gap, aes(year, lifeExp)) +
geom_point() +
facet_trelliscope(~ country + continent,
name = "lifeExp_by_country",
desc = "Life expectancy vs. year for 142 countries.",
nrow = 2, ncol = 3, scales = c("same", "sliced")
)
Chapter 3 - Trelliscope in the Tidyverse
Trelliscope in the Tidyverse:
Cognostics:
Trelliscope options:
Visualizing databases of images:
download.file(url, destfile = file.path(path, basename(url))) Example code includes:
# do not have dataset 'stocks'
by_symbol <- stocks %>%
group_by(symbol) %>%
nest()
min_volume_fn <- function(x) min(x$volume)
# Create new column
by_symbol_min <- by_symbol %>%
mutate(min_volume = map_dbl(data, min_volume_fn))
# Build an interactive OHLC (open-high-low-close) chart for one symbol's
# data frame `d` via plotly; expects `date`, `open`, `close`, `high`, and
# `low` columns (referenced with plotly's `~column` formula syntax).
ohlc_plot <- function(d) {
plot_ly(d, x = ~date, type = "ohlc",
open = ~open, close = ~close,
high = ~high, low = ~low)
}
by_symbol_plot <- mutate(by_symbol, panel=map_plot(data, ohlc_plot))
trelliscope(by_symbol_plot, name="ohlc_top500")
# Create market_cap_log
by_symbol <- mutate(by_symbol,
market_cap_log = cog(
val = log10(market_cap),
desc = "log base 10 market capitalization"
)
)
# Percent return over the full period covered by `x`: the closing price of
# the last row relative to the opening price of the first row.
annual_return <- function(x) {
  first_open <- head(x$open, 1)
  last_close <- tail(x$close, 1)
  100 * (last_close - first_open) / first_open
}
# Compute by_symbol_avg
by_symbol_avg <- mutate(by_symbol,
stats = map(data, function(x) {
data_frame(
mean_close = mean(x$close),
mean_volume = mean(x$volume),
annual_return = annual_return(x)
)
}
)
)
# Create the trelliscope display
p <- trelliscope(by_symbol, width=600, height=300, name="ohlc_top500", desc="Example aspect 2 plot")
pokemon %>%
# Reduce the variables in the dataset
select(pokemon, type_1, attack, generation_id, url_image) %>%
mutate(
# Respecify pokemon
pokemon = cog(pokemon, default_label=TRUE),
# Create panel variable
panel = img_panel(url_image)
) %>%
# Create the display
trelliscope(name="pokemon", nrow=3, ncol=6)
Chapter 4 - Case Study: Exploring Montreal BIXI Bike Data
Montreal BIXI Bike Data:
Summary Visualization Recap:
Top 100 routes dataset:
Wrap up:
Example code includes:
# DO NOT HAVE FULL BIXI Data
bike04 <- read_csv("./RInputFiles/BIXIData/OD_2017-04.csv")
## Parsed with column specification:
## cols(
## start_date = col_datetime(format = ""),
## start_station_code = col_integer(),
## end_date = col_datetime(format = ""),
## end_station_code = col_integer(),
## duration_sec = col_integer(),
## is_member = col_integer()
## )
bike05 <- read_csv("./RInputFiles/BIXIData/OD_2017-05.csv")
## Parsed with column specification:
## cols(
## start_date = col_datetime(format = ""),
## start_station_code = col_integer(),
## end_date = col_datetime(format = ""),
## end_station_code = col_integer(),
## duration_sec = col_integer(),
## is_member = col_integer()
## )
bike06 <- read_csv("./RInputFiles/BIXIData/OD_2017-06.csv")
## Parsed with column specification:
## cols(
## start_date = col_datetime(format = ""),
## start_station_code = col_integer(),
## end_date = col_datetime(format = ""),
## end_station_code = col_integer(),
## duration_sec = col_integer(),
## is_member = col_integer()
## )
bike07 <- read_csv("./RInputFiles/BIXIData/OD_2017-07.csv")
## Parsed with column specification:
## cols(
## start_date = col_datetime(format = ""),
## start_station_code = col_integer(),
## end_date = col_datetime(format = ""),
## end_station_code = col_integer(),
## duration_sec = col_integer(),
## is_member = col_integer()
## )
bike08 <- read_csv("./RInputFiles/BIXIData/OD_2017-08.csv")
## Parsed with column specification:
## cols(
## start_date = col_datetime(format = ""),
## start_station_code = col_integer(),
## end_date = col_datetime(format = ""),
## end_station_code = col_integer(),
## duration_sec = col_integer(),
## is_member = col_integer()
## )
bike09 <- read_csv("./RInputFiles/BIXIData/OD_2017-09.csv")
## Parsed with column specification:
## cols(
## start_date = col_datetime(format = ""),
## start_station_code = col_integer(),
## end_date = col_datetime(format = ""),
## end_station_code = col_integer(),
## duration_sec = col_integer(),
## is_member = col_integer()
## )
bike10 <- read_csv("./RInputFiles/BIXIData/OD_2017-10.csv")
## Parsed with column specification:
## cols(
## start_date = col_datetime(format = ""),
## start_station_code = col_integer(),
## end_date = col_datetime(format = ""),
## end_station_code = col_integer(),
## duration_sec = col_integer(),
## is_member = col_integer()
## )
bike11 <- read_csv("./RInputFiles/BIXIData/OD_2017-11.csv")
## Parsed with column specification:
## cols(
## start_date = col_datetime(format = ""),
## start_station_code = col_integer(),
## end_date = col_datetime(format = ""),
## end_station_code = col_integer(),
## duration_sec = col_integer(),
## is_member = col_integer()
## )
stations <- read_csv("./RInputFiles/BIXIData/Stations_2017.csv")
## Parsed with column specification:
## cols(
## code = col_integer(),
## name = col_character(),
## latitude = col_double(),
## longitude = col_double()
## )
bike <- rbind(bike04, bike05, bike06, bike07, bike08, bike09, bike10, bike11) %>%
mutate(membership=factor(is_member, levels=c(1, 0), labels=c("member", "non-member")),
start_day=as.Date(start_date),
start_dow=factor(lubridate::wday(start_date), levels=1:7, labels=c("Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat")),
weekday=factor(ifelse(start_dow %in% c("Sat", "Sun"), 2, 1), levels=1:2, labels=c("workweek", "weekend")),
start_hod=lubridate::hour(start_date),
start_mon=lubridate::month(start_date),
start_wk=lubridate::week(start_date)
)
glimpse(bike)
## Observations: 4,740,357
## Variables: 13
## $ start_date <dttm> 2017-04-15 00:00:00, 2017-04-15 00:01:00, ...
## $ start_station_code <int> 7060, 6173, 6203, 6104, 6174, 6719, 6223, 6...
## $ end_date <dttm> 2017-04-15 00:31:00, 2017-04-15 00:10:00, ...
## $ end_station_code <int> 7060, 6173, 6204, 6114, 6174, 6354, 6148, 6...
## $ duration_sec <int> 1841, 553, 195, 285, 569, 620, 679, 311, 21...
## $ is_member <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ membership <fct> member, member, member, member, member, mem...
## $ start_day <date> 2017-04-15, 2017-04-15, 2017-04-15, 2017-0...
## $ start_dow <fct> Sat, Sat, Sat, Sat, Sat, Sat, Sat, Sat, Sat...
## $ weekday <fct> weekend, weekend, weekend, weekend, weekend...
## $ start_hod <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ start_mon <dbl> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4...
## $ start_wk <dbl> 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,...
# Compute daily counts
daily <- bike %>%
group_by(start_day, weekday) %>%
summarise(n = n())
# Plot the result
ggplot(daily, aes(x=start_day, y=n, color=weekday)) +
geom_point()
# Compute week_hod
week_hod <- bike %>%
group_by(start_wk, start_hod, weekday) %>%
summarize(n=n())
# Plot the result
ggplot(week_hod, aes(x=start_wk, y=n, color=weekday)) +
geom_point() +
facet_grid(cols=vars(start_hod)) +
scale_y_sqrt()
# Compute wk_memb_hod
wk_memb_hod <- bike %>%
group_by(start_wk, start_hod, weekday, membership) %>%
summarize(n=n())
# Plot the result
ggplot(wk_memb_hod, aes(x=start_wk, y=n, color=weekday)) +
geom_point() +
facet_grid(membership ~ start_hod) +
scale_y_sqrt()
# Compute daily_may
daily_may <- bike %>%
filter(start_mon == 5) %>%
group_by(start_day, start_hod, membership) %>%
summarise(n = n())
# Plot the result
ggplot(daily_may, aes(x=start_hod, y=n, color=membership)) +
geom_point() +
facet_wrap(~ start_day, ncol=7)
# ggplot(daily_may, aes(x=start_hod, y=n, color = membership)) +
# geom_point() +
# Facet on start_day
# facet_trelliscope(~ start_day, nrow=2, ncol=7)
# Construct a Google Maps directions URL (cycling travel mode) between a
# start and end coordinate pair; used below as a per-panel link cognostic.
make_gmap_url <- function(start_lat, start_lon, end_lat, end_lon) {
  base <- "https://www.google.com/maps/dir/?api=1"
  origin <- paste0("&origin=", start_lat, ",", start_lon)
  destination <- paste0("&destination=", end_lat, ",", end_lon)
  paste0(base, origin, destination, "&travelmode=bicycling")
}
load("./RInputFiles/route_hod.RData")
glimpse(route_hod)
## Observations: 4,114
## Variables: 11
## $ start_station_code <int> 6012, 6012, 6012, 6012, 6012, 6012, 6012, 6...
## $ end_station_code <int> 6015, 6015, 6015, 6015, 6015, 6015, 6015, 6...
## $ start_hod <dbl> 0, 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 8, 9...
## $ weekday <fct> workweek, weekend, workweek, weekend, workw...
## $ n <int> 12, 13, 11, 2, 2, 6, 3, 3, 1, 1, 2, 18, 1, ...
## $ start_station_name <chr> "Métro St-Laurent (de Maisonneuve / St-Laur...
## $ start_lat <dbl> 45.51066, 45.51066, 45.51066, 45.51066, 45....
## $ start_lon <dbl> -73.56497, -73.56497, -73.56497, -73.56497,...
## $ end_station_name <chr> "Berri / de Maisonneuve", "Berri / de Maiso...
## $ end_lat <dbl> 45.5153, 45.5153, 45.5153, 45.5153, 45.5153...
## $ end_lon <dbl> -73.56127, -73.56127, -73.56127, -73.56127,...
# Compute tot_rides, weekday_diff, and map_url
route_hod_updated <- route_hod %>%
group_by(start_station_code, end_station_code) %>%
mutate(
tot_rides = sum(n),
weekday_diff = mean(n[weekday == "workweek"]) - mean(n[weekday == "weekend"]),
map_url = make_gmap_url(start_lat, start_lon, end_lat, end_lon))
# Create the plot
# ggplot(route_hod, aes(x=start_hod, y=n, color=weekday)) +
# geom_point(size=3) +
# facet_trelliscope(~start_station_name + end_station_name, nrow=2, ncol=4) +
# theme(legend.position = "none")
Chapter 1 - Proportions of a Whole
Course/Grammar of Graphics Information:
Pie Chart and Friends:
When to use Bars:
Example code includes:
who_disease <- readr::read_csv("./RInputFiles/who_disease.csv")
## Parsed with column specification:
## cols(
## region = col_character(),
## countryCode = col_character(),
## country = col_character(),
## disease = col_character(),
## year = col_double(),
## cases = col_double()
## )
glimpse(who_disease)
## Observations: 43,262
## Variables: 6
## $ region <chr> "EMR", "EUR", "AFR", "EUR", "AFR", "AMR", "AMR", "EUR",...
## $ countryCode <chr> "AFG", "ALB", "DZA", "AND", "AGO", "ATG", "ARG", "ARM",...
## $ country <chr> "Afghanistan", "Albania", "Algeria", "Andorra", "Angola...
## $ disease <chr> "measles", "measles", "measles", "measles", "measles", ...
## $ year <dbl> 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2...
## $ cases <dbl> 638, 17, 41, 0, 53, 0, 0, 2, 99, 27, 0, 0, 0, 972, 0, 1...
# set x aesthetic to region column
ggplot(who_disease, aes(x=region)) +
geom_bar()
# filter data to AMR region.
amr_region <- who_disease %>%
filter(region=="AMR")
# map x to year and y to cases.
ggplot(amr_region, aes(x=year, y=cases)) +
# lower alpha to 0.5 to see overlap.
geom_point(alpha=0.5)
# Wrangle data into form we want.
disease_counts <- who_disease %>%
mutate(disease = ifelse(disease %in% c('measles', 'mumps'), disease, 'other')) %>%
group_by(disease) %>%
summarise(total_cases = sum(cases))
ggplot(disease_counts, aes(x = 1, y = total_cases, fill = disease)) +
# Use a column geometry.
geom_col() +
# Change coordinate system to polar and set theta to 'y'.
coord_polar(theta="y")
ggplot(disease_counts, aes(x = 1, y = total_cases, fill = disease)) +
# Use a column geometry.
geom_col() +
# Change coordinate system to polar.
coord_polar(theta = "y") +
# Clean up the background with theme_void and give it a proper title with ggtitle.
theme_void() +
ggtitle('Proportion of diseases')
disease_counts <- who_disease %>%
group_by(disease) %>%
summarise(total_cases = sum(cases)) %>%
mutate(percent = round(total_cases/sum(total_cases)*100))
# Create an array of rounded percentages for diseases.
case_counts <- disease_counts$percent
# Name the percentage array with disease_counts$disease
names(case_counts) <- disease_counts$disease
# Pass case_counts vector to the waffle function to plot
waffle::waffle(case_counts)
disease_counts <- who_disease %>%
mutate(disease = ifelse(disease %in% c('measles', 'mumps'), disease, 'other')) %>%
group_by(disease, year) %>% # note the addition of year to the grouping.
summarise(total_cases = sum(cases))
# add the mapping of year to the x axis.
ggplot(disease_counts, aes(x=year, y = total_cases, fill = disease)) +
# Change the position argument to make bars full height
geom_col(position="fill")
disease_counts <- who_disease %>%
mutate(
disease = ifelse(disease %in% c('measles', 'mumps'), disease, 'other') %>%
factor(levels=c('measles', 'other', 'mumps')) # change factor levels to desired ordering
) %>%
group_by(disease, year) %>%
summarise(total_cases = sum(cases))
# plot
ggplot(disease_counts, aes(x = year, y = total_cases, fill = disease)) +
geom_col(position = 'fill')
disease_counts <- who_disease %>%
# Filter to on or later than 1999
filter(year >= 1999) %>%
mutate(disease = ifelse(disease %in% c('measles', 'mumps'), disease, 'other')) %>%
group_by(disease, region) %>% # Add region column to grouping
summarise(total_cases = sum(cases))
# Set aesthetics so disease is the stacking variable, region is the x-axis and counts are the y
ggplot(disease_counts, aes(x=region, y=total_cases, fill=disease)) +
# Add a column geometry with the proper position value.
geom_col(position="fill")
Chapter 2 - Point Data
Point Data:
Point Charts:
Tuning Charts:
Example code includes:
who_disease %>%
# filter to india in 1980
filter(country=="India", year==1980) %>%
# map x aesthetic to disease and y to cases
ggplot(aes(x=disease, y=cases)) +
# use geom_col to draw
geom_col()
who_disease %>%
# filter data to observations of greater than 1,000 cases
filter(cases > 1000) %>%
# map the x-axis to the region column
ggplot(aes(x=region)) +
# add a geom_bar call
geom_bar()
interestingCountries <- c('NGA', 'SDN', 'FRA', 'NPL', 'MYS', 'TZA', 'YEM', 'UKR', 'BGD', 'VNM')
who_subset <- who_disease %>%
filter(countryCode %in% interestingCountries, disease == 'measles', year %in% c(1992, 2002)) %>%
mutate(year = paste0('cases_', year)) %>%
spread(year, cases)
# Reorder y axis and change the cases year to 1992
ggplot(who_subset, aes(x = log10(cases_1992), y = reorder(country, cases_1992))) +
geom_point()
who_subset %>%
# calculate the log fold change between 2016 and 2006
mutate(logFoldChange = log2(cases_2002/cases_1992)) %>%
# set y axis as country ordered with respect to logFoldChange
ggplot(aes(x = logFoldChange, y = reorder(country, logFoldChange))) +
geom_point() +
# add a visual anchor at x = 0
geom_vline(xintercept=0)
who_subset %>%
mutate(logFoldChange = log2(cases_2002/cases_1992)) %>%
ggplot(aes(x = logFoldChange, y = reorder(country, logFoldChange))) +
geom_point() +
geom_vline(xintercept = 0) +
xlim(-6,6) +
# add facet_grid arranged in the column direction by region and free_y scales
facet_grid(region ~ ., scale="free_y")
amr_pertussis <- who_disease %>%
filter(region == 'AMR', year == 1980, disease == 'pertussis')
# Set x axis as country ordered with respect to cases.
ggplot(amr_pertussis, aes(x = reorder(country, cases), y = cases)) +
geom_col() +
# flip axes
coord_flip()
amr_pertussis %>%
# filter to countries that had > 0 cases.
filter(cases > 0) %>%
ggplot(aes(x = reorder(country, cases), y = cases)) +
geom_col() +
coord_flip() +
theme(panel.grid.major.y = element_blank())
amr_pertussis %>% filter(cases > 0) %>%
ggplot(aes(x = reorder(country, cases), y = cases)) +
# switch geometry to points and set point size = 2
geom_point(size=2) +
# change y-axis to log10.
scale_y_log10() +
# add theme_minimal()
theme_minimal() +
coord_flip()
Chapter 3 - Single Distributions
Importance of Distributions:
Histogram Nuances:
Kernel Density Estimates:
Example code includes:
colKeep <- c('work_zone', 'vehicle_type', 'vehicle_year', 'vehicle_color', 'race', 'gender',
'driver_state', 'speed_limit', 'speed', 'day_of_week', 'day_of_month', 'month',
'hour_of_day', 'speed_over', 'percentage_over_limit'
)
colRead <- c("Work Zone", "VehicleType", "Year", "Color", "Race", "Gender",
"DL State", "Description", "Date Of Stop", "Time Of Stop")
regFind <- ".*EXCEEDING MAXIMUM SPEED: ([0-9]+) MPH .* POSTED ([0-9]+) MPH .*"
md_speeding <- readr::read_csv("./RInputFiles/MD_Traffic/Traffic_violations.csv", n_max=200000) %>%
select(colRead) %>%
filter(grepl("EXCEEDING MAXIMUM SPEED: ", Description)) %>%
rename(work_zone="Work Zone", vehicle_type=VehicleType, vehicle_year=Year,
vehicle_color=Color, race=Race, gender=Gender, driver_state="DL State",
stopDate="Date Of Stop", stopTime="Time Of Stop") %>%
mutate(speed_limit=as.integer(gsub(regFind, "\\2", Description)),
speed=as.integer(gsub(regFind, "\\1", Description)),
speed_over=speed - speed_limit,
percentage_over_limit=100 * speed_over / speed_limit,
stopDate=as.Date(stopDate, format="%m/%d/%Y"),
day_of_week=lubridate::wday(stopDate),
day_of_month=lubridate::day(stopDate),
month=lubridate::month(stopDate),
hour_of_day=lubridate::hour(stopTime)
)
## Parsed with column specification:
## cols(
## .default = col_character(),
## `Time Of Stop` = col_time(format = ""),
## Latitude = col_double(),
## Longitude = col_double(),
## Year = col_integer()
## )
## See spec(...) for full column specifications.
# Print data to console
glimpse(md_speeding)
## Observations: 11,935
## Variables: 18
## $ work_zone <chr> "No", "No", "No", "No", "No", "No", "No"...
## $ vehicle_type <chr> "02 - Automobile", "08 - Recreational Ve...
## $ vehicle_year <int> 2004, 2012, 1999, 2007, 2008, 2013, 2006...
## $ vehicle_color <chr> "GOLD", "GRAY", "BLACK", "BLACK", "BLACK...
## $ race <chr> "OTHER", "WHITE", "OTHER", "HISPANIC", "...
## $ gender <chr> "M", "F", "M", "F", "F", "M", "M", "M", ...
## $ driver_state <chr> "MD", "MD", "MD", "MD", "MD", "MD", "PA"...
## $ Description <chr> "EXCEEDING MAXIMUM SPEED: 49 MPH IN A PO...
## $ stopDate <date> 2013-03-20, 2013-05-27, 2012-07-26, 201...
## $ stopTime <time> 08:53:00, 14:13:00, 09:24:00, 10:40:00,...
## $ speed_limit <int> 40, 55, 35, 55, 55, 40, 55, 40, 40, 45, ...
## $ speed <int> 49, 80, 44, 92, 64, 49, 74, 49, 49, 68, ...
## $ speed_over <int> 9, 25, 9, 37, 9, 9, 19, 9, 9, 23, 19, 16...
## $ percentage_over_limit <dbl> 22.50000, 45.45455, 25.71429, 67.27273, ...
## $ day_of_week <dbl> 4, 2, 5, 7, 4, 3, 5, 1, 6, 7, 5, 1, 2, 6...
## $ day_of_month <int> 20, 27, 26, 13, 31, 1, 26, 1, 12, 26, 11...
## $ month <dbl> 3, 5, 7, 7, 7, 10, 2, 9, 7, 10, 9, 5, 3,...
## $ hour_of_day <int> 8, 14, 9, 10, 13, 17, 20, 8, 19, 16, 10,...
# Histogram of MPH over the limit, red cars only (default 30 bins).
# Change filter to red cars
md_speeding %>%
filter(vehicle_color == 'RED') %>%
# switch x mapping to speed_over column
ggplot(aes(x = speed_over)) +
geom_histogram() +
# give plot a title
ggtitle('MPH over speed limit | Red cars')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Same histogram for all cars, with slightly transparent bars.
ggplot(md_speeding) +
# Add the histogram geometry with x mapped to speed_over
geom_histogram(aes(x=speed_over), alpha=0.7) +
# Add minimal theme
theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Density histogram: stat(density) rescales bar heights so total area = 1,
# making the shape comparable across differently-sized subsets.
ggplot(md_speeding) +
geom_histogram(aes(x=hour_of_day, y=stat(density)), alpha=0.8)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Two histograms of percentage over the limit, varying bin count and fill.
# Load md_speeding into ggplot
ggplot(md_speeding) +
# add a geom_histogram with x mapped to percentage_over_limit
geom_histogram(
aes(x=percentage_over_limit),
bins=40, # set bin number to 40
alpha=0.8) # reduce alpha to 0.8
# Same data with many more (narrower) bins and a fixed fill colour.
ggplot(md_speeding) +
geom_histogram(
aes(x = percentage_over_limit),
bins = 100 , # switch to 100 bins
fill="steelblue", # set the fill of the bars to 'steelblue'
alpha = 0.8 )
# Histogram of citations by hour of day: one bar per hour, bins centred on
# the half hour, with an axis break at every hour.
ggplot(md_speeding, aes(x = hour_of_day)) +
geom_histogram(
binwidth=1, # set binwidth to 1
center=0.5 # Center bins at the half (0.5) hour
# FIX: removed a stray trailing comma; in R it creates an empty argument
# that silently matches a positional formal and is flagged by lintr.
) +
scale_x_continuous(breaks = 0:24)
# Kernel density of citation hour for heavy-duty trucks only.
# filter data to just heavy duty trucks
truck_speeding <- md_speeding %>%
filter(vehicle_type == "06 - Heavy Duty Truck")
ggplot(truck_speeding, aes(x = hour_of_day)) +
# switch to density with bin width of 1.5, keep fill
geom_density(fill = 'steelblue', bw=1.5) +
# add a subtitle stating binwidth
labs(title = 'Citations by hour', subtitle="Gaussian kernel SD = 1.5")
# Same density with a rug strip so individual observations stay visible.
ggplot(truck_speeding, aes(x = hour_of_day)) +
# Adjust opacity to see gridlines with alpha = 0.7
geom_density(bw = 1.5, fill = 'steelblue', alpha=0.7) +
# add a rug plot using geom_rug to see individual datapoints, set alpha to 0.5.
geom_rug(alpha=0.5) +
labs(title = 'Citations by hour', subtitle = "Gaussian kernel SD = 1.5")
# Density + rug for percentage over the limit; wider kernel and a very
# light rug because there are many overlapping points.
ggplot(md_speeding, aes(x = percentage_over_limit)) +
# Increase bin width to 2.5
geom_density(fill = 'steelblue', bw = 2.5, alpha = 0.7) +
# lower rugplot alpha to 0.05
geom_rug(alpha = 0.05) +
labs(
title = 'Distribution of % over speed limit',
# modify subtitle to reflect change in kernel width
subtitle = "Gaussian kernel SD = 2.5"
)
Chapter 4 - Comparing Distributions
Introduction to Comparing Distributions:
Bee Swarms and Violins:
Comparing Spatially Related Distributions:
Wrap Up:
Example code includes:
# Boxplot of speed by driver gender, red cars only.
md_speeding %>%
filter(vehicle_color == 'RED') %>%
# Map x and y to gender and speed columns respectively
ggplot(aes(x=gender, y=speed)) +
# add a boxplot geometry
geom_boxplot() +
# give plot supplied title
labs(title = 'Speed of red cars by gender of driver')
# Same comparison with raw points jittered underneath a see-through box.
md_speeding %>%
filter(vehicle_color == 'RED') %>%
ggplot(aes(x = gender, y = speed)) +
# add jittered points with alpha of 0.3 and color 'steelblue'
geom_jitter(alpha=0.3, color="steelblue") +
# make boxplot transparent with alpha = 0
geom_boxplot(alpha=0) +
labs(title = 'Speed of red cars by gender of driver')
# All colors at once: one jitter+box panel per vehicle colour.
# remove color filter
md_speeding %>%
ggplot(aes(x = gender, y = speed)) +
geom_jitter(alpha = 0.3, color = 'steelblue') +
geom_boxplot(alpha = 0) +
# add a facet_wrap by vehicle_color
facet_wrap(~ vehicle_color) +
# change title to reflect new faceting
labs(title = 'Speed of different car colors, separated by gender of driver')
# Beeswarm alternative to jitter: points packed without overlap.
md_speeding %>%
filter(vehicle_color == 'RED') %>%
ggplot(aes(x = gender, y = speed)) +
# cex scales point spacing; alpha makes dense regions readable
ggbeeswarm::geom_beeswarm(cex=0.5, alpha=0.8) +
# add a transparent boxplot on top of points
geom_boxplot(alpha=0)
# Violin plot (mirrored KDE) with the raw points layered on top.
md_speeding %>%
filter(vehicle_color == 'RED') %>%
ggplot(aes(x = gender, y = speed)) +
# Replace beeswarm geometry with a violin geometry with kernel width of 2.5
geom_violin(bw = 2.5) +
# add individual points on top of violins
geom_point(alpha=0.3, size=0.5)
# Violin + narrow boxplot + tick-shaped points (shape 95 draws a short
# horizontal bar, a compact way to show each observation).
md_speeding %>%
filter(vehicle_color == 'RED') %>%
ggplot(aes(x = gender, y = speed)) +
geom_violin(bw = 2.5) +
# add a transparent boxplot and shrink its width to 0.3
geom_boxplot(alpha=0, width=0.3) +
# Reset point size to default and set point shape to 95
geom_point(alpha = 0.3, shape = 95) +
# Supply a subtitle detailing the kernel width
labs(subtitle = 'Gaussian kernel SD = 2.5')
# Faceted violins across vehicle colours. NOTE(review): the max(density)
# warnings that follow appear to come from facet/gender combinations with
# no data -- confirm against the colour counts.
md_speeding %>%
ggplot(aes(x = gender, y = speed)) +
# replace with violin plot with kernel width of 2.5, change color argument to fill
geom_violin(bw = 2.5, fill = 'steelblue') +
# reduce width to 0.3
geom_boxplot(alpha = 0, width=0.3) +
facet_wrap(~vehicle_color) +
labs(
title = 'Speed of different car colors, separated by gender of driver',
# add a subtitle w/ kernel width
subtitle = "Gaussian kernel width: 2.5"
)
## Warning in max(data$density): no non-missing arguments to max; returning -
## Inf
## Warning in max(data$density): no non-missing arguments to max; returning -
## Inf
# Ridgeline densities per weekday. The levels vector reorders lubridate's
# numeric wday coding (1 = Sunday by default -- TODO confirm week_start)
# so the ridges run Monday -> Sunday top to bottom.
md_speeding %>%
mutate(day_of_week = factor(day_of_week, levels=c(2, 3, 4, 5, 6, 7, 1),
labels = c("Mon","Tues","Wed","Thu","Fri","Sat","Sun")
)
) %>%
ggplot(aes( x = percentage_over_limit, y = day_of_week)) +
# Set bandwidth to 3.5
ggridges::geom_density_ridges(bandwidth=3.5) +
# add limits of 0 to 150 to x-scale
scale_x_continuous(limits=c(0, 150)) +
# provide subtitle with bandwidth
labs(subtitle='Gaussian kernel SD = 3.5')
## Warning: Removed 9 rows containing non-finite values (stat_density_ridges).
# Ridgeline densities per weekday, made translucent and snugged against the
# axis. FIX: corrected the displayed subtitle typo "Guassian" -> "Gaussian".
md_speeding %>%
mutate(day_of_week = factor(day_of_week, levels=c(2, 3, 4, 5, 6, 7, 1),
labels = c("Mon","Tues","Wed","Thu","Fri","Sat","Sun")
)
) %>%
ggplot(aes( x = percentage_over_limit, y = day_of_week)) +
# make ridgeline densities a bit see-through with alpha = 0.7
ggridges::geom_density_ridges(bandwidth = 3.5, alpha=0.7) +
# set expand values to c(0,0)
scale_x_continuous(limits = c(0,150), expand=c(0, 0)) +
labs(subtitle = 'Gaussian kernel SD = 3.5') +
# remove y axis ticks
theme(axis.ticks.y=element_blank())
## Warning: Removed 9 rows containing non-finite values (stat_density_ridges).
# Ridgelines with raw observations drawn as nudged vertical tick marks
# beneath each ridge. FIX: corrected the displayed subtitle typo
# "Guassian" -> "Gaussian".
md_speeding %>%
mutate(day_of_week = factor(day_of_week, levels=c(2, 3, 4, 5, 6, 7, 1),
labels = c("Mon","Tues","Wed","Thu","Fri","Sat","Sun")
)
) %>%
ggplot(aes( x = percentage_over_limit, y = day_of_week)) +
geom_point(
# make semi-transparent with alpha = 0.2
alpha=0.2,
# turn points to vertical lines with shape = '|'
shape="|",
# nudge the points downward by 0.05
position=position_nudge(y=-0.05)
) +
ggridges::geom_density_ridges(bandwidth = 3.5, alpha = 0.7) +
scale_x_continuous(limits = c(0,150), expand = c(0,0)) +
labs(subtitle = 'Gaussian kernel SD = 3.5') +
theme( axis.ticks.y = element_blank() )
## Warning: Removed 9 rows containing non-finite values (stat_density_ridges).
## Warning: Removed 9 rows containing missing values (geom_point).
Chapter 1 - Introduction to Linear Algebra
Motivations:
Matrix-Vector Operations:
Matrix-Matrix Calculations:
Example code includes:
# Vector construction (rep / seq / c) and elementwise arithmetic; the final
# addition demonstrates R's recycling warning on mismatched lengths.
# Creating three 3's and four 4's, respectively
rep(3, 3)
## [1] 3 3 3
rep(4, 4)
## [1] 4 4 4 4
# Creating a vector with the first three even numbers and the first three odd numbers
seq(2, 6, by = 2)
## [1] 2 4 6
seq(1, 5, by = 2)
## [1] 1 3 5
# Re-creating the previous four vectors using the 'c' command
c(3, 3, 3)
## [1] 3 3 3
c(4, 4, 4, 4)
## [1] 4 4 4 4
c(2, 4, 6)
## [1] 2 4 6
c(1, 3, 5)
## [1] 1 3 5
x <- 1:7
y <- 2*x
z <- c(1, 1, 2)
# Add x to y and print
print(x + y)
## [1] 3 6 9 12 15 18 21
# Multiply z by 2 and print
print(2 * z)
## [1] 2 2 4
# Multiply x and y by each other and print (elementwise, not matrix product)
print(x * y)
## [1] 2 8 18 32 50 72 98
# Add x to z, if possible, and print
print(x + z) # should throw a warning for the recycling problem
## Warning in x + z: longer object length is not a multiple of shorter object
## length
## [1] 2 3 5 5 6 8 8
# Matrix construction: constant matrices, the byrow fill order, and
# elementwise matrix addition.
A <- matrix(1, 2, 2)
# Create a matrix of all 1's and all 2's that are 2 by 3 and 3 by 2, respectively
matrix(1, 2, 3)
## [,1] [,2] [,3]
## [1,] 1 1 1
## [2,] 1 1 1
print(matrix(2, 3, 2))
## [,1] [,2]
## [1,] 2 2
## [2,] 2 2
## [3,] 2 2
# Create a matrix and changing the byrow designation.
matrix(c(1, 2, 3, 2), nrow = 2, ncol = 2, byrow = FALSE)
## [,1] [,2]
## [1,] 1 3
## [2,] 2 2
matrix(c(1, 2, 3, 2), nrow = 2, ncol = 2, byrow = TRUE)
## [,1] [,2]
## [1,] 1 2
## [2,] 3 2
# Add A to the previously-created matrix
A + matrix(c(1, 2, 3, 2), nrow = 2, ncol = 2, byrow = TRUE)
## [,1] [,2]
## [1,] 2 3
## [2,] 4 3
# Diagonal scaling matrices acting on the vector (1, 1):
# A stretches the first coordinate by 4; B shrinks the second to 2/3.
A <- diag(c(4, 1))
b <- c(1, 1)
B <- diag(c(1, 2/3))
# Multiply A by b on the left
A %*% b
## [,1]
## [1,] 4
## [2,] 1
# Multiply B by b on the left
B %*% b
## [,1]
## [1,] 1.0000000
## [2,] 0.6666667
# Reflection/scaling matrices applied to b = (2, 1): A flips x, B flips y,
# C1 flips and scales both coordinates.
b <- c(2, 1)
A <- matrix(data=c(-1, 0, 0, 1), nrow=2, ncol=2, byrow=FALSE)
B <- matrix(data=c(1, 0, 0, -1), nrow=2, ncol=2, byrow=FALSE)
C1 <- matrix(data=c(-4, 0, 0, -2), nrow=2, ncol=2, byrow=FALSE)
# Multiply A by b on the left
A%*%b
## [,1]
## [1,] -2
## [2,] 1
# Multiply B by b on the left
B%*%b
## [,1]
## [1,] 2
## [2,] -1
# Multiply C1 by b on the left
C1%*%b
## [,1]
## [1,] -8
## [2,] -2
# Matrix-matrix products: AB != BA in general, so the order in which the
# transformations are applied to b matters.
A <- matrix(data=sqrt(2)*c(1, 1, -1, 1), nrow=2, ncol=2, byrow=FALSE)
B <- matrix(data=c(1, 0, 0, -1), nrow=2, ncol=2, byrow=FALSE)
b <- c(1, 1)
# Multply A by B on the left
A%*%B
## [,1] [,2]
## [1,] 1.414214 1.414214
## [2,] 1.414214 -1.414214
# Multiply A by B on the right
B%*%A
## [,1] [,2]
## [1,] 1.414214 -1.414214
## [2,] -1.414214 -1.414214
# Multiply b by B then A (on the left)
A%*%B%*%b
## [,1]
## [1,] 2.828427
## [2,] 0.000000
# Multiply b by A then B (on the left)
B%*%A%*%b
## [,1]
## [1,] 0.000000
## [2,] -2.828427
# Matrix inversion with solve(); A %*% Ainv and Ainv %*% A both recover
# the 2x2 identity.
A <- matrix(data=c(1, -1, 2, 2), nrow=2, ncol=2, byrow=FALSE)
# Take the inverse of the 2 by 2 identity matrix (the identity is its own inverse)
solve(diag(2))
## [,1] [,2]
## [1,] 1 0
## [2,] 0 1
# Take the inverse of the matrix A
Ainv <- solve(A)
# Multiply A by its inverse on the left
Ainv%*%A
## [,1] [,2]
## [1,] 1 0
## [2,] 0 1
# Multiply A by its inverse on the right
A%*%Ainv
## [,1] [,2]
## [1,] 1 0
## [2,] 0 1
Chapter 2 - Matrix-Vector Equations
Motivation for Solving Matrix-Vector Equations:
Matrix-Vector Equations Theory:
Solving Matrix-Vector Equations:
Other Considerations for Matrix-Vector Equations:
Example code includes:
M <- readr::read_csv("./RInputFiles/WNBA_Data_2017_M.csv")
## Parsed with column specification:
## cols(
## Atlanta = col_double(),
## Chicago = col_double(),
## Connecticut = col_double(),
## Dallas = col_double(),
## Indiana = col_double(),
## `Los Angeles` = col_double(),
## Minnesota = col_double(),
## `New York` = col_double(),
## Phoenix = col_double(),
## `San Antonio` = col_double(),
## Seattle = col_double(),
## Washington = col_double(),
## WNBA = col_double()
## )
glimpse(M)
## Observations: 13
## Variables: 13
## $ Atlanta <dbl> 33, -4, -2, -3, -3, -3, -3, -3, -3, -3, -3, -3, 1
## $ Chicago <dbl> -4, 33, -3, -3, -3, -3, -2, -3, -3, -3, -3, -3, 1
## $ Connecticut <dbl> -2, -3, 34, -3, -3, -3, -3, -4, -4, -3, -3, -3, 1
## $ Dallas <dbl> -3, -3, -3, 34, -3, -4, -3, -3, -2, -3, -3, -4, 1
## $ Indiana <dbl> -3, -3, -3, -3, 33, -3, -3, -3, -3, -3, -2, -4, 1
## $ `Los Angeles` <dbl> -3, -3, -3, -4, -3, 41, -8, -3, -6, -3, -2, -3, 1
## $ Minnesota <dbl> -3, -2, -3, -3, -3, -8, 41, -3, -4, -3, -3, -6, 1
## $ `New York` <dbl> -3, -3, -4, -3, -3, -3, -3, 34, -3, -2, -3, -4, 1
## $ Phoenix <dbl> -3, -3, -4, -2, -3, -6, -4, -3, 38, -3, -4, -3, 1
## $ `San Antonio` <dbl> -3, -3, -3, -3, -3, -3, -3, -2, -3, 32, -4, -2, 1
## $ Seattle <dbl> -3, -3, -3, -3, -2, -2, -3, -3, -4, -4, 33, -3, 1
## $ Washington <dbl> -3, -3, -3, -4, -4, -3, -6, -4, -3, -2, -3, 38, 1
## $ WNBA <dbl> -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1
# Make names syntactic (these names contain at most one space, so replacing
# only the first occurrence is sufficient), drop the aggregate WNBA column
# and the last row, and label rows to match columns.
names(M) <- stringr::str_replace(names(M), " ", ".")
M <- M %>%
select(-WNBA) %>%
slice(-n())
M <- as.data.frame(M)
row.names(M) <- names(M)
# Read the season point differentials, one row per team
f <- readr::read_csv("./RInputFiles/WNBA_Data_2017_f.csv")
## Parsed with column specification:
## cols(
## Differential = col_double()
## )
glimpse(f)
## Observations: 13
## Variables: 1
## $ Differential <dbl> -135, -171, 152, -104, -308, 292, 420, 83, -4, -213, -...
# Drop the trailing aggregate row and align row labels with the teams in M
f <- slice(f, -n())
f <- as.data.frame(f)
row.names(f) <- names(M)
# Print the 12x12 Massey matrix M
print(M)
## Atlanta Chicago Connecticut Dallas Indiana Los.Angeles Minnesota
## Atlanta 33 -4 -2 -3 -3 -3 -3
## Chicago -4 33 -3 -3 -3 -3 -2
## Connecticut -2 -3 34 -3 -3 -3 -3
## Dallas -3 -3 -3 34 -3 -4 -3
## Indiana -3 -3 -3 -3 33 -3 -3
## Los.Angeles -3 -3 -3 -4 -3 41 -8
## Minnesota -3 -2 -3 -3 -3 -8 41
## New.York -3 -3 -4 -3 -3 -3 -3
## Phoenix -3 -3 -4 -2 -3 -6 -4
## San.Antonio -3 -3 -3 -3 -3 -3 -3
## Seattle -3 -3 -3 -3 -2 -2 -3
## Washington -3 -3 -3 -4 -4 -3 -6
## New.York Phoenix San.Antonio Seattle Washington
## Atlanta -3 -3 -3 -3 -3
## Chicago -3 -3 -3 -3 -3
## Connecticut -4 -4 -3 -3 -3
## Dallas -3 -2 -3 -3 -4
## Indiana -3 -3 -3 -2 -4
## Los.Angeles -3 -6 -3 -2 -3
## Minnesota -3 -4 -3 -3 -6
## New.York 34 -3 -2 -3 -4
## Phoenix -3 38 -3 -4 -3
## San.Antonio -2 -3 32 -4 -2
## Seattle -3 -4 -4 33 -3
## Washington -4 -3 -2 -3 38
# Print the vector of season point differentials f
print(f)
## Differential
## Atlanta -135
## Chicago -171
## Connecticut 152
## Dallas -104
## Indiana -308
## Los.Angeles 292
## Minnesota 420
## New.York 83
## Phoenix -4
## San.Antonio -213
## Seattle -5
## Washington -7
# The columns of M and the entries of f both sum to zero, so M is singular
# and M r = f has no unique solution; augmenting M with a row of 1's and a
# column of -1's (corner set to 1) pins the ratings to sum to zero.
# Find the sum of the first column
sum(M[, 1])
## [1] 0
# Find the sum of the vector f
sum(f)
## [1] 0
M <- as.matrix(M)
# Add a row of 1's
M <- rbind(M, rep(1, 12))
# Add a column of -1's
M <- cbind(M, rep(-1, 13))
# Change the element in the lower-right corner of the matrix M
M[13, 13] <- 1
# Print M
print(M)
## Atlanta Chicago Connecticut Dallas Indiana Los.Angeles Minnesota
## Atlanta 33 -4 -2 -3 -3 -3 -3
## Chicago -4 33 -3 -3 -3 -3 -2
## Connecticut -2 -3 34 -3 -3 -3 -3
## Dallas -3 -3 -3 34 -3 -4 -3
## Indiana -3 -3 -3 -3 33 -3 -3
## Los.Angeles -3 -3 -3 -4 -3 41 -8
## Minnesota -3 -2 -3 -3 -3 -8 41
## New.York -3 -3 -4 -3 -3 -3 -3
## Phoenix -3 -3 -4 -2 -3 -6 -4
## San.Antonio -3 -3 -3 -3 -3 -3 -3
## Seattle -3 -3 -3 -3 -2 -2 -3
## Washington -3 -3 -3 -4 -4 -3 -6
## 1 1 1 1 1 1 1
## New.York Phoenix San.Antonio Seattle Washington
## Atlanta -3 -3 -3 -3 -3 -1
## Chicago -3 -3 -3 -3 -3 -1
## Connecticut -4 -4 -3 -3 -3 -1
## Dallas -3 -2 -3 -3 -4 -1
## Indiana -3 -3 -3 -2 -4 -1
## Los.Angeles -3 -6 -3 -2 -3 -1
## Minnesota -3 -4 -3 -3 -6 -1
## New.York 34 -3 -2 -3 -4 -1
## Phoenix -3 38 -3 -4 -3 -1
## San.Antonio -2 -3 32 -4 -2 -1
## Seattle -3 -4 -4 33 -3 -1
## Washington -4 -3 -2 -3 38 -1
## 1 1 1 1 1 1
# Find the inverse of M (the augmented matrix is now invertible)
solve(M)
## Atlanta Chicago Connecticut Dallas Indiana
## Atlanta 0.032449804 0.005402927 0.003876665 0.004630004 0.004629590
## Chicago 0.005402927 0.032446789 0.004608094 0.004626913 0.004628272
## Connecticut 0.003876665 0.004608094 0.031714805 0.004613451 0.004629714
## Dallas 0.004630004 0.004626913 0.004613451 0.031707219 0.004649172
## Indiana 0.004629590 0.004628272 0.004629714 0.004649172 0.032447936
## Los.Angeles 0.004626242 0.004554829 0.004676789 0.005214940 0.004652111
## Minnesota 0.004611109 0.003985203 0.004651940 0.004727810 0.004678479
## New.York 0.004609212 0.004627729 0.005362761 0.004647832 0.004649262
## Phoenix 0.004610546 0.004608018 0.005295038 0.004013187 0.004613089
## San.Antonio 0.004630254 0.004631081 0.004608596 0.004609009 0.004587382
## Seattle 0.004629212 0.004631185 0.004646217 0.004595132 0.003854641
## Washington 0.004627769 0.004582295 0.004649264 0.005298666 0.005313685
## -0.083333333 -0.083333333 -0.083333333 -0.083333333 -0.083333333
## Los.Angeles Minnesota New.York Phoenix San.Antonio
## Atlanta 0.004626242 0.004611109 0.004609212 0.004610546 0.004630254
## Chicago 0.004554829 0.003985203 0.004627729 0.004608018 0.004631081
## Connecticut 0.004676789 0.004651940 0.005362761 0.005295038 0.004608596
## Dallas 0.005214940 0.004727810 0.004647832 0.004013187 0.004609009
## Indiana 0.004652111 0.004678479 0.004649262 0.004613089 0.004587382
## Los.Angeles 0.027807608 0.007319076 0.004637275 0.006363490 0.004606288
## Minnesota 0.007319076 0.027810474 0.004677632 0.005388578 0.004578013
## New.York 0.004637275 0.004677632 0.031716432 0.004648253 0.003835528
## Phoenix 0.006363490 0.005388578 0.004648253 0.029212019 0.004646110
## San.Antonio 0.004606288 0.004578013 0.003835528 0.004646110 0.033267202
## Seattle 0.004032687 0.004573214 0.004607331 0.005265228 0.005427397
## Washington 0.004841998 0.006331805 0.005314087 0.004669776 0.003906474
## -0.083333333 -0.083333333 -0.083333333 -0.083333333 -0.083333333
## Seattle Washington
## Atlanta 0.004629212 0.004627769 8.333333e-02
## Chicago 0.004631185 0.004582295 8.333333e-02
## Connecticut 0.004646217 0.004649264 8.333333e-02
## Dallas 0.004595132 0.005298666 8.333333e-02
## Indiana 0.003854641 0.005313685 8.333333e-02
## Los.Angeles 0.004032687 0.004841998 8.333333e-02
## Minnesota 0.004573214 0.006331805 8.333333e-02
## New.York 0.004607331 0.005314087 8.333333e-02
## Phoenix 0.005265228 0.004669776 8.333333e-02
## San.Antonio 0.005427397 0.003906474 8.333333e-02
## Seattle 0.032485332 0.004585756 8.333333e-02
## Washington 0.004585756 0.029211757 8.333333e-02
## -0.083333333 -0.083333333 2.220446e-16
# Append the constraint value 0 to f (ratings sum to zero), then solve the
# augmented system M r = f for the team ratings.
f <- as.matrix(f)
f <- rbind(f, 0)
# Solve for r and rename column
r <- solve(M)%*%f
colnames(r) <- "Rating"
# Print r
print(r)
## Rating
## Atlanta -4.012938e+00
## Chicago -5.156260e+00
## Connecticut 4.309525e+00
## Dallas -2.608129e+00
## Indiana -8.532958e+00
## Los.Angeles 7.850327e+00
## Minnesota 1.061241e+01
## New.York 2.541565e+00
## Phoenix 8.979110e-01
## San.Antonio -6.181574e+00
## Seattle -2.666953e-01
## Washington 5.468121e-01
## 1.043610e-14
# Find the rating vector using ginv (Moore-Penrose pseudoinverse); the
# values match solve(M) %*% f, but ginv returns an unnamed matrix, so the
# team row labels are lost in the printout.
r <- MASS::ginv(M)%*%f
colnames(r) <- "Rating"
print(r)
## Rating
## [1,] -4.012938e+00
## [2,] -5.156260e+00
## [3,] 4.309525e+00
## [4,] -2.608129e+00
## [5,] -8.532958e+00
## [6,] 7.850327e+00
## [7,] 1.061241e+01
## [8,] 2.541565e+00
## [9,] 8.979110e-01
## [10,] -6.181574e+00
## [11,] -2.666953e-01
## [12,] 5.468121e-01
## [13,] 5.773160e-14
Chapter 3 - Eigenvalues and Eigenvectors
Introduction to Eigenvalues and Eigenvectors:
Definition of Eigenvalues and Eigenvectors:
Computing Eigenvalues and Eigenvectors in R:
Some more on Eigenvalues and Eigenvectors:
Example code includes:
# Applying the diagonal scaling matrix A to the vector (1, 1): the first
# coordinate is unchanged, the second shrinks to 2/3.
A <- matrix(data=c(1, 0, 0, 2/3), nrow=2, ncol=2, byrow=FALSE)
# A is loaded for you
print(A%*%rep(1, 2))
## [,1]
## [1,] 1.0000000
## [2,] 0.6666667
# Verify eigenpairs of a 3x3 matrix directly via A v - lambda v ~ 0; the
# residuals around 1e-7 come from the eigenvector entries being rounded to
# seven decimal places. Scaling an eigenvector (x2, x0.5) scales the
# residual but it remains an eigenvector.
A <- matrix(data=c(-1, 0, 0, 2, 7, 0, 4, 12, -4), nrow=3, ncol=3, byrow=FALSE)
# Show that 7 is an eigenvalue for A
A%*%c(0.2425356, 0.9701425, 0) - 7*c(0.2425356, 0.9701425, 0)
## [,1]
## [1,] 2e-07
## [2,] 0e+00
## [3,] 0e+00
# Show that -4 is an eigenvalue for A
A%*%c(-0.3789810, -0.6821657, 0.6253186) - (-4)*c(-0.3789810, -0.6821657, 0.6253186)
## [,1]
## [1,] -2.220446e-16
## [2,] 5.000000e-07
## [3,] 0.000000e+00
# Show that -1 is an eigenvalue for A
A%*%c(1, 0, 0) - (-1)*c(1, 0, 0)
## [,1]
## [1,] 0
## [2,] 0
## [3,] 0
# Show the double of the eigenvector
A%*%((2)*c(0.2425356, 0.9701425, 0)) - 7*(2)*c(0.2425356, 0.9701425, 0)
## [,1]
## [1,] 4e-07
## [2,] 0e+00
## [3,] 0e+00
# Show half of the eigenvector
A%*%((0.5)*c(0.2425356, 0.9701425, 0)) - 7*(0.5)*c(0.2425356, 0.9701425, 0)
## [,1]
## [1,] 1e-07
## [2,] 0e+00
## [3,] 0e+00
# Full eigen-decomposition of a small 2x2 example matrix.
A <- matrix(c(1, 1, 2, 1), nrow = 2)
# Compute the eigenvalues of A and store in Lambda
Lambda <- eigen(A)
# Print eigenvalues (these are 1 + sqrt(2) and 1 - sqrt(2))
print(Lambda$values[1])
## [1] 2.414214
print(Lambda$values[2])
## [1] -0.4142136
# Each eigenvalue lambda must satisfy det(lambda * I - A) = 0; the tiny
# residuals below are floating-point noise.
det(Lambda$values[1] * diag(2) - A)
## [1] -3.140185e-16
det(Lambda$values[2] * diag(2) - A)
## [1] -3.140185e-16
# Find the eigenvectors of A and store them in Lambda
Lambda <- eigen(A)
# Print eigenvectors (one per column of Lambda$vectors)
print(Lambda$vectors[, 1])
## [1] 0.8164966 0.5773503
print(Lambda$vectors[, 2])
## [1] -0.8164966 0.5773503
# Check the defining identity A v = lambda v, i.e. lambda * v - A %*% v = 0
Lambda$values[1] * Lambda$vectors[, 1] - A %*% Lambda$vectors[, 1]
## [,1]
## [1,] 0
## [2,] 0
Lambda$values[2] * Lambda$vectors[, 2] - A %*% Lambda$vectors[, 2]
## [,1]
## [1,] -1.110223e-16
## [2,] 8.326673e-17
# 4-state mutation (Markov) transition matrix; it is symmetric and doubly
# stochastic, so its stationary distribution is uniform (0.25 each).
Mtemp <- matrix(data=c(0.98, 0.005, 0.005, 0.01, 0.005, 0.98, 0.01, 0.005, 0.005, 0.01, 0.98, 0.005, 0.01, 0.005, 0.005, 0.98), nrow=4, ncol=4, byrow=FALSE)
Mtemp
## [,1] [,2] [,3] [,4]
## [1,] 0.980 0.005 0.005 0.010
## [2,] 0.005 0.980 0.010 0.005
## [3,] 0.005 0.010 0.980 0.005
## [4,] 0.010 0.005 0.005 0.980
# This code iterates mutation 1000 times (the original comment said 100,
# but the loop runs 1:1000); the state converges to the uniform vector.
x <- c(1, 0, 0, 0)
for (j in 1:1000) {x <- Mtemp%*%x}
# Print x
print(x)
## [,1]
## [1,] 0.25
## [2,] 0.25
## [3,] 0.25
## [4,] 0.25
# Print and scale the first eigenvector of Mtemp.
# BUG FIX: the original called eigen(M), which picked up the 13x13 Massey
# matrix left over from the previous chapter and printed 13 meaningless
# complex entries; the mutation matrix in this exercise is Mtemp, whose
# scaled leading eigenvector matches the iterated limit above.
Lambda <- eigen(Mtemp)
v1 <- Lambda$vectors[, 1]/sum(Lambda$vectors[, 1])
print(v1)
## [1] 0.25 0.25 0.25 0.25
Chapter 4 - Principal Component Analysis
Introduction to the Idea of PCA (Principal Component Analysis):
Linear Algebra Behind PCA:
Performing PCA in R:
Wrap Up:
Example code includes:
combine <- readr::read_csv("./RInputFiles/DataCampCombine.csv")
## Parsed with column specification:
## cols(
## player = col_character(),
## position = col_character(),
## school = col_character(),
## year = col_double(),
## height = col_double(),
## weight = col_double(),
## forty = col_double(),
## vertical = col_double(),
## bench = col_double(),
## broad_jump = col_double(),
## three_cone = col_double(),
## shuttle = col_double(),
## drafted = col_character()
## )
glimpse(combine)
## Observations: 2,885
## Variables: 13
## $ player <chr> "Jaire Alexander", "Brian Allen", "Mark Andrews", "Troy ...
## $ position <chr> "CB", "C", "TE", "S", "EDGE", "DE", "WR", "CB", "ILB", "...
## $ school <chr> "Louisville", "Michigan St.", "Oklahoma", "Penn St.", "K...
## $ year <dbl> 2018, 2018, 2018, 2018, 2018, 2018, 2018, 2018, 2018, 20...
## $ height <dbl> 71, 73, 77, 74, 76, 78, 76, 72, 73, 73, 75, 76, 80, 72, ...
## $ weight <dbl> 192, 298, 256, 198, 257, 262, 216, 185, 248, 228, 218, 2...
## $ forty <dbl> 4.38, 5.34, 4.67, 4.34, 4.87, 4.60, 4.62, 4.36, 4.59, 4....
## $ vertical <dbl> 35.0, 26.5, 31.0, 41.0, 30.0, 38.5, 34.0, 31.5, 36.0, 33...
## $ bench <dbl> 14, 27, 17, 16, 20, 18, 13, 13, 26, 15, 16, 31, 14, 14, ...
## $ broad_jump <dbl> 127, 99, 113, 131, 118, 128, 121, 119, 124, 122, 112, 10...
## $ three_cone <dbl> 6.71, 7.81, 7.34, 6.56, 7.12, 7.53, 7.07, 6.93, 6.90, 6....
## $ shuttle <dbl> 3.98, 4.71, 4.38, 4.03, 4.23, 4.48, 4.25, 4.40, 4.36, 4....
## $ drafted <chr> "Green Bay Packers / 1st / 18th pick / 2018", "Los Angel...
# Print the first few observations of the dataset
head(combine)
## # A tibble: 6 x 13
## player position school year height weight forty vertical bench broad_jump
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Jaire~ CB Louis~ 2018 71 192 4.38 35 14 127
## 2 Brian~ C Michi~ 2018 73 298 5.34 26.5 27 99
## 3 Mark ~ TE Oklah~ 2018 77 256 4.67 31 17 113
## 4 Troy ~ S Penn ~ 2018 74 198 4.34 41 16 131
## 5 Doran~ EDGE Kansas 2018 76 257 4.87 30 20 118
## 6 Ade A~ DE Tulane 2018 78 262 4.6 38.5 18 128
## # ... with 3 more variables: three_cone <dbl>, shuttle <dbl>, drafted <chr>
# Strong pairwise correlations among drills motivate dimension reduction.
# Find the correlation between variables forty and three_cone
cor(combine$forty, combine$three_cone)
## [1] 0.8315171
# Find the correlation between variables vertical and broad_jump
cor(combine$vertical, combine$broad_jump)
## [1] 0.8163375
# Build the sample covariance matrix B = A'A / (n - 1) from the mean-centred
# drill measurements, then confirm it against var()/cov() and inspect its
# eigenvalues (variance is dominated by the first component on this
# unstandardised scale).
# Extract columns 5-12 of combine
A <- combine[, 5:12]
# Take the matrix of A
A <- as.matrix(A)
# Centre every column at zero in one vectorised step. This replaces the
# original eight copy-pasted `A[, k] <- A[, k] - mean(A[, k])` statements;
# sweep() subtracts colMeans(A) from the corresponding column of A, giving
# a numerically identical result.
A <- sweep(A, 2, colMeans(A), "-")
# Create matrix B from equation in instructions
B <- t(A)%*%A/(nrow(A) - 1)
# Compare 1st element of B to 1st column of variance of A
B[1,1]
## [1] 7.159794
var(A[, 1])
## [1] 7.159794
# Compare 1st element of 2nd column and row element of B to 1st and 2nd columns of A
B[1, 2]
## [1] 90.78808
B[2, 1]
## [1] 90.78808
cov(A[, 1], A[, 2])
## [1] 90.78808
# Find eigenvalues of B
V <- eigen(B)
# Print eigenvalues
V$values
## [1] 2.187628e+03 4.403246e+01 2.219205e+01 5.267129e+00 2.699702e+00
## [6] 6.317016e-02 1.480866e-02 1.307283e-02
# Standardise (centre and scale) the drill columns so PCA is not dominated
# by the variables with the largest raw variance.
# Scale columns 5-12 of combine
B <- scale(combine[, 5:12])
# Print the first few rows of the data
head(B)
## height weight forty vertical bench broad_jump
## [1,] -1.11844839 -1.30960025 -1.3435337 0.5624657 -1.1089286 1.45502476
## [2,] -0.37100257 1.00066356 1.6449741 -1.4281627 0.9238361 -1.49512459
## [3,] 1.12388907 0.08527601 -0.4407553 -0.3743006 -0.6398290 -0.02004991
## [4,] 0.00272034 -1.17883060 -1.4680548 1.9676151 -0.7961955 1.87647467
## [5,] 0.75016616 0.10707096 0.1818505 -0.6084922 -0.1707295 0.50676247
## [6,] 1.49761199 0.21604566 -0.6586673 1.3821362 -0.4834625 1.56038724
## three_cone shuttle
## [1,] -1.38083506 -1.5879750
## [2,] 1.16888714 1.1170258
## [3,] 0.07946038 -0.1057828
## [4,] -1.72852445 -1.4027010
## [5,] -0.43048406 -0.6616049
## [6,] 0.51986694 0.2647653
# Summarize the principal component analysis on the standardised data
summary(prcomp(B))
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.3679 0.9228 0.78904 0.61348 0.46811 0.37178 0.34834
## Proportion of Variance 0.7009 0.1064 0.07782 0.04704 0.02739 0.01728 0.01517
## Cumulative Proportion 0.7009 0.8073 0.88514 0.93218 0.95957 0.97685 0.99202
## PC8
## Standard deviation 0.25266
## Proportion of Variance 0.00798
## Cumulative Proportion 1.00000
# Subset combine only to "WR"
combine_WR <- subset(combine, position == "WR")
# Scale columns 5-12 of combine
B <- scale(combine_WR[, 5:12])
# Print the first few rows of the data
head(B)
## height weight forty vertical bench broad_jump
## [1,] 1.4022982 0.88324903 1.20674474 -0.3430843 -0.3223377 0.07414249
## [2,] 0.5575402 -0.09700717 -0.80129388 -0.4969965 -0.7938424 -0.95388361
## [3,] 0.9799192 1.58343202 0.88968601 1.0421255 0.8564239 1.61618163
## [4,] 0.9799192 1.16332222 1.41811723 -1.5743819 -0.7938424 -1.29655897
## [5,] -1.1319757 -1.56739147 -0.80129388 -0.1891721 -0.0865854 -1.29655897
## [6,] 0.1351613 0.11304773 0.04419607 0.2725645 -1.0295947 0.24548017
## three_cone shuttle
## [1,] 0.712845019 0.02833449
## [2,] -1.098542478 0.84141123
## [3,] -1.853287268 -1.46230619
## [4,] -1.148858797 0.50262926
## [5,] 0.008416548 -0.64922946
## [6,] 0.109049187 0.84141123
# Summarize the principal component analysis for the WR-only subset
summary(prcomp(B))
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.5425 1.4255 1.0509 0.9603 0.77542 0.63867 0.59792
## Proportion of Variance 0.2974 0.2540 0.1380 0.1153 0.07516 0.05099 0.04469
## Cumulative Proportion 0.2974 0.5514 0.6894 0.8047 0.87987 0.93085 0.97554
## PC8
## Standard deviation 0.44235
## Proportion of Variance 0.02446
## Cumulative Proportion 1.00000
Chapter 1 - Introduction
Turnover:
Exploring the data:
HR data architecture:
Example code includes:
# HR-analytics chapter 1: load the org data, compute overall / per-level /
# per-location turnover rates, then enrich with ratings and manager-survey
# data. NOTE(review): "org.csv", `rating`, and `survey` are provided by the
# DataCamp workspace and are not created in this document -- the code will
# not run as-is outside that environment.
# Load the readr and dplyr packages
library(readr)
library(dplyr)
# Import the org data
org <- read_csv("org.csv")
# Check the structure of org dataset, the dplyr way
glimpse(org)
# Count Active and Inactive employees
org %>%
count(status)
# Calculate turnover rate (turnover is apparently 0/1, so mean = rate -- confirm)
org %>%
summarize(avg_turnover_rate = mean(turnover))
# Calculate level wise turnover rate
df_level <- org %>%
group_by(level) %>%
summarize(turnover_level = mean(turnover))
# Check the results
df_level
# Visualize the results
ggplot(df_level, aes(x = level, y = turnover_level)) +
geom_col()
# Calculate location wise turnover rate
df_location <- org %>%
group_by(location) %>%
summarize(turnover_location = mean(turnover))
# Check the results
df_location
# Visualize the results
ggplot(df_location, aes(x = location, y = turnover_location)) +
geom_col()
# Count the number of employees across levels
org %>%
count(level)
# Select the employees at Analyst and Specialist level
org2 <- org %>%
filter(level %in% c("Analyst", "Specialist"))
# Validate the results
org2 %>%
count(level)
# View the structure of rating dataset
glimpse(rating)
# Complete the code to join rating to org2 dataset
org3 <- left_join(org2, rating, by = "emp_id")
# Calculate rating wise turnover rate
df_rating <- org3 %>%
group_by(rating) %>%
summarize(turnover_rating = mean(turnover))
# Check the result
df_rating
# View the structure of survey dataset
glimpse(survey)
# Complete the code to join survey to org3 dataset
org_final <- left_join(org3, survey, by="mgr_id")
# Compare manager effectiveness scores
ggplot(org_final, aes(x = status, y = mgr_effectiveness)) +
geom_boxplot()
# View the structure of the dataset
glimpse(org_final)
# Number of variables in the dataset
variables <- ncol(org_final)
# Compare the travel distance of Active and Inactive employees
ggplot(org_final, aes(x = status, y = distance_from_home)) +
geom_boxplot()
Chapter 2 - Feature Engineering
Feature engineering:
Compensation:
Information value:
Example code includes:
# HR-analytics chapter 2: derive candidate predictors (age gap with manager,
# job-hop index, tenure, compa-ratio) and rank them by Information Value.
# NOTE(review): time_length()/interval() are lubridate functions; lubridate
# is not attached by library(tidyverse) 1.3.0, so it must be loaded
# elsewhere in the environment -- confirm.
# Add age_diff
emp_age_diff <- org_final %>%
mutate(age_diff = mgr_age - emp_age)
# Plot the distribution of age difference
ggplot(emp_age_diff, aes(x = status, y = age_diff)) +
geom_boxplot()
# Add job_hop_index (guarded against division by zero previous employers)
emp_jhi <- emp_age_diff %>%
mutate(job_hop_index = ifelse(no_previous_companies_worked != 0, total_experience / no_previous_companies_worked, 0))
# Compare job hopping index of Active and Inactive employees
ggplot(emp_jhi, aes(x = status, y = job_hop_index)) +
geom_boxplot()
# Add tenure: active employees are measured to the cutoff date, leavers to
# their last working date
emp_tenure <- emp_jhi %>%
mutate(tenure = ifelse(status == "Active",
time_length(interval(date_of_joining, cutoff_date),
"years"),
time_length(interval(date_of_joining, last_working_date),
"years")))
# Compare tenure of active and inactive employees
ggplot(emp_tenure, aes(x = status, y = tenure)) +
geom_boxplot()
# Plot the distribution of compensation
ggplot(emp_tenure, aes(x = compensation)) +
geom_histogram()
# Plot the distribution of compensation across levels
ggplot(emp_tenure,
aes(x = level, y = compensation)) +
geom_boxplot()
# Compare compensation of Active and Inactive employees across levels
ggplot(emp_tenure,
aes(x = level, y = compensation, fill = status)) +
geom_boxplot()
# Add median_compensation and compa_ratio (compensation relative to the
# level median). NOTE(review): no ungroup() follows this group_by(), so
# emp_compa_ratio and emp_final remain grouped by level downstream.
emp_compa_ratio <- emp_tenure %>%
group_by(level) %>%
mutate(median_compensation = median(compensation),
compa_ratio = compensation / median_compensation)
# Look at the median compensation for each level
emp_compa_ratio %>%
distinct(level, median_compensation)
# Add compa_level: above or below the level median
emp_final <- emp_compa_ratio %>%
mutate(compa_level = ifelse(compa_ratio > 1, "Above", "Below"))
# Compare compa_level for Active & Inactive employees
ggplot(emp_final, aes(x = status, fill = compa_level)) +
geom_bar(position = "fill")
# Load Information package
library(Information)
# Compute Information Value of every predictor against turnover
IV <- create_infotables(data = emp_final, y = "turnover")
# Print Information Value summary (predictors ranked by IV)
IV$Summary
Chapter 3 - Predicting Turnover
Data Splitting:
Introduction to Logistic Regression:
Multicollinearity:
Building final model:
Example code includes:
# Load caret
library(caret)
# Set seed of 567
set.seed(567)
# Store row numbers for training dataset: index_train
index_train <- createDataPartition(emp_final$turnover, p = 0.7, list = FALSE)
# Create training dataset: train_set
train_set <- emp_final[index_train, ]
# Create testing dataset: test_set
test_set <- emp_final[-index_train, ]
# Calculate turnover proportion in train_set
train_set %>%
count(status) %>%
mutate(prop = n / sum(n))
# Calculate turnover proportion in test_set
test_set %>%
count(status) %>%
mutate(prop = n / sum(n))
# Build a simple logistic regression model
simple_log <- glm(turnover ~ percent_hike,
family = "binomial", data = train_set_multi)
# Print summary
summary(simple_log)
# Build a multiple logistic regression model
multi_log <- glm(turnover ~ ., family = "binomial",
data = train_set_multi)
# Print summary
summary(multi_log)
# Load the car package
library(car)
# Model you built in a previous exercise
multi_log <- glm(turnover ~ ., family = "binomial", data = train_set_multi)
# Check for multicollinearity
vif(multi_log)
# Which variable has the highest VIF?
highest <- "level"
# Remove level
model_1 <- glm(turnover ~ . - level, family = "binomial",
data = train_set_multi)
# Check multicollinearity again
vif(model_1)
# Which variable has the highest VIF value?
highest <- "compensation"
# Remove level & compensation
model_2 <- glm(turnover ~ . - level - compensation, family = "binomial",
data = train_set_multi)
# Check multicollinearity again
vif(model_2)
# Does any variable have a VIF greater than 5?
highest <- FALSE
# Build the final logistic regression model
final_log <- glm(turnover ~ ., family = "binomial", data=train_set_final)
# Print summary
summary(final_log)
# Make predictions for training dataset
prediction_train <- predict(final_log, newdata = train_set, type = "response")
# Look at the prediction range
hist(prediction_train)
# Make predictions for testing dataset
prediction_test <- predict(final_log, newdata = test_set, type = "response")
# Look at the prediction range
hist(prediction_test)
# Print the probability of turnover
prediction_test[c(150, 200)]
Chapter 4 - Model Validation, HR Interventions, and ROI
Validating logistic regression results:
Designing retention strategy:
Return on investment:
Wrap up:
Example code includes:
# Classify predictions using a cut-off of 0.5
prediction_categories <- ifelse(prediction_test > 0.5, 1, 0)
# Construct a confusion matrix
conf_matrix <- table(prediction_categories, test_set$turnover)
conf_matrix
# Load caret
library(caret)
# Call confusionMatrix
confusionMatrix(conf_matrix)
# What is the accuracy?
accuracy <- round(unname(confusionMatrix(conf_matrix)$overall["Accuracy"]), 3)
# Load tidypredict
library(tidypredict)
# Calculate probability of turnover
emp_risk <- emp_final %>%
filter(status == "Active") %>%
tidypredict_to_column(final_log)
# View the two employees with the highest predicted turnover risk
emp_risk %>%
select(emp_id, fit) %>%
top_n(2)
# Create turnover risk buckets
emp_risk_bucket <- emp_risk %>%
mutate(risk_bucket = cut(fit, breaks = c(0, 0.5, 0.6, 0.8, 1),
labels = c("no-risk", "low-risk",
"medium-risk", "high-risk")))
# Count employees in each risk bucket
emp_risk_bucket %>%
count(risk_bucket)
# Plot histogram of percent hike
ggplot(emp_final, aes(x = percent_hike)) +
geom_histogram(binwidth = 3)
# Create salary hike_range of Analyst level employees
emp_hike_range <- emp_final %>%
filter(level == "Analyst") %>%
mutate(hike_range = cut(percent_hike, breaks = c(0, 10, 15, 20),
include.lowest = TRUE,
labels = c("0 to 10", "11 to 15", "16 to 20")
)
)
# Calculate the turnover rate for each salary hike range
df_hike <- emp_hike_range %>%
group_by(hike_range) %>%
summarize(turnover_rate_hike = mean(turnover))
# Check the results
df_hike
# Visualize the results
ggplot(df_hike, aes(x = hike_range, y = turnover_rate_hike)) +
geom_col()
# Compute extra cost
extra_cost <- median_salary_analyst * (0.05)
# Compute savings
savings <- turnover_cost * 0.15
# Calculate ROI
ROI <- (savings / extra_cost) * 100
# Print ROI
cat(paste0("The return on investment is ", round(ROI), "%!"))
Chapter 1 - Why Care About Missing Data?
Introduction to Missing Data:
Why care about missing values?
How to visualize missing values?
Example code includes:
# Create x, a vector, with values NA, NaN, Inf, ".", and "missing"
x <- c(NA, NaN, Inf, ".", "missing")
# Use any_na() and are_na() to explore the missing values
naniar::any_na(x)
## [1] TRUE
naniar::are_na(x)
## [1] TRUE FALSE FALSE FALSE FALSE
dat_hw <- data.frame(weight=c(95.16, NA, 102.82, 80.98, 112.91, 94, 105.43, 77.79, NA, 98.93, 68.26, 94.16, 105.32, 61.4, 72.89, 85.67, NA, 63.3, 98.98, 72.17, NA, 103.63, 87.52, 89.78, 103.03, 97.26, 82.77, 68.27, 92.93, 74.55, 61.55, 86.09, 80.04, 88.78, 76.25, 80.44, 99.37, 84.21, NA, 88.5, 97.34, 95.35, 91.91, 78.76, NA, 101.57, 68.33, 89.75, 90.96, 87.17, 104.96, NA, 72.18, 74.09, NA, 92.65, 79.61, 110.09, 77.67, 87.46, 66.91, 76.59, 84.96, 80.21, NA, 64.15, 55.14, NA, 84.47, 100.97, NA, 83.26, 42.15, 89.25, 92.04, NA, 72.76, 69.67, 80.37, NA, 58.38, 84.34, 62.84, NA, 94.23, 83.48, 75.54, 79.93, 79.66, NA, 97.61, 77.11, 83.92, 104.56, 105.94, 107.15, 45.75, 76.61, 88.29, 93.05), height=c(1.95, 2.35, 1.64, 2.47, 1.92, 1.9, 0.83, 2.7, 1.98, 1.83, 0.24, NA, 1.67, NA, 2.03, 2.78, 0.59, 1.99, 2.34, 1.99, -0.05, 0.36, NA, 0.88, NA, 1.37, 2.62, 0.71, 0.52, -0.12, 2.25, 1.06, 1.99, 0.94, -1.11, 1.23, 1.31, 2, 1.1, 0.55, 1.84, 2.14, NA, NA, 1.94, 0.66, 0.47, 2.37, 3.4, 1.4, 2.52, 0.15, 2.42, 0.47, NA, 1.08, 1.89, 2.92, 2.71, NA, 2.72, NA, NA, 1.76, 0.73, 1.84, -0.09, 3.62, 2.34, 0.61, 2.15, 0.39, 0.92, NA, 1.41, 0, 3.51, NA, 0.18, 1.31, 1.19, 2.81, 3.32, 0.06, 3.44, NA, 1.32, NA, 2.46, 3.09, 0.13, 0.92, 0.16, 0.88, 1.38, 0.28, 2.51, NA, 1.05, 3.16))
# Use n_miss() to count the total number of missing values in dat_hw
naniar::n_miss(dat_hw)
## [1] 30
# Use n_miss() on dat_hw$weight to count the total number of missing values
naniar::n_miss(dat_hw$weight)
## [1] 15
# Use n_complete() on dat_hw to count the total number of complete values
naniar::n_complete(dat_hw)
## [1] 170
# Use n_complete() on dat_hw$weight to count the total number of complete values
naniar::n_complete(dat_hw$weight)
## [1] 85
# Use prop_miss() and prop_complete() on dat_hw to compute the proportion of missing and complete values
naniar::prop_miss(dat_hw)
## [1] 0.15
naniar::prop_complete(dat_hw)
## [1] 0.85
data(airquality)
str(airquality)
## 'data.frame': 153 obs. of 6 variables:
## $ Ozone : int 41 36 12 18 NA 28 23 19 8 NA ...
## $ Solar.R: int 190 118 149 313 NA NA 299 99 19 194 ...
## $ Wind : num 7.4 8 12.6 11.5 14.3 14.9 8.6 13.8 20.1 8.6 ...
## $ Temp : int 67 72 74 62 56 66 65 59 61 69 ...
## $ Month : int 5 5 5 5 5 5 5 5 5 5 ...
## $ Day : int 1 2 3 4 5 6 7 8 9 10 ...
# Summarise missingness in each variable of the `airquality` dataset
naniar::miss_var_summary(airquality)
## # A tibble: 6 x 3
## variable n_miss pct_miss
## <chr> <int> <dbl>
## 1 Ozone 37 24.2
## 2 Solar.R 7 4.58
## 3 Wind 0 0
## 4 Temp 0 0
## 5 Month 0 0
## 6 Day 0 0
# Summarise missingness in each case of the `airquality` dataset
naniar::miss_case_summary(airquality)
## # A tibble: 153 x 3
## case n_miss pct_miss
## <int> <int> <dbl>
## 1 5 2 33.3
## 2 27 2 33.3
## 3 6 1 16.7
## 4 10 1 16.7
## 5 11 1 16.7
## 6 25 1 16.7
## 7 26 1 16.7
## 8 32 1 16.7
## 9 33 1 16.7
## 10 34 1 16.7
## # ... with 143 more rows
# Return the summary of missingness in each variable, grouped by Month, in the `airquality` dataset
airquality %>%
group_by(Month) %>%
naniar::miss_var_summary()
## Warning: `cols` is now required.
## Please use `cols = c(data)`
## # A tibble: 25 x 4
## # Groups: Month [5]
## Month variable n_miss pct_miss
## <int> <chr> <int> <dbl>
## 1 5 Ozone 5 16.1
## 2 5 Solar.R 4 12.9
## 3 5 Wind 0 0
## 4 5 Temp 0 0
## 5 5 Day 0 0
## 6 6 Ozone 21 70
## 7 6 Solar.R 0 0
## 8 6 Wind 0 0
## 9 6 Temp 0 0
## 10 6 Day 0 0
## # ... with 15 more rows
# Return the summary of missingness in each case, grouped by Month, in the `airquality` dataset
airquality %>%
group_by(Month) %>%
naniar::miss_case_summary()
## Warning: `cols` is now required.
## Please use `cols = c(data)`
## # A tibble: 153 x 4
## # Groups: Month [5]
## Month case n_miss pct_miss
## <int> <int> <int> <dbl>
## 1 5 5 2 40
## 2 5 27 2 40
## 3 5 6 1 20
## 4 5 10 1 20
## 5 5 11 1 20
## 6 5 25 1 20
## 7 5 26 1 20
## 8 5 1 0 0
## 9 5 2 0 0
## 10 5 3 0 0
## # ... with 143 more rows
# Tabulate missingness in each variable and case of the `airquality` dataset
naniar::miss_var_table(airquality)
## # A tibble: 3 x 3
## n_miss_in_var n_vars pct_vars
## <int> <int> <dbl>
## 1 0 4 66.7
## 2 7 1 16.7
## 3 37 1 16.7
naniar::miss_case_table(airquality)
## # A tibble: 3 x 3
## n_miss_in_case n_cases pct_cases
## <int> <int> <dbl>
## 1 0 111 72.5
## 2 1 40 26.1
## 3 2 2 1.31
# Tabulate the missingness in each variable, grouped by Month, in the `airquality` dataset
airquality %>%
group_by(Month) %>%
naniar::miss_var_table()
## Warning: `cols` is now required.
## Please use `cols = c(data)`
## # A tibble: 12 x 4
## # Groups: Month [5]
## Month n_miss_in_var n_vars pct_vars
## <int> <int> <int> <dbl>
## 1 5 0 3 60
## 2 5 4 1 20
## 3 5 5 1 20
## 4 6 0 4 80
## 5 6 21 1 20
## 6 7 0 4 80
## 7 7 5 1 20
## 8 8 0 3 60
## 9 8 3 1 20
## 10 8 5 1 20
## 11 9 0 4 80
## 12 9 1 1 20
# Tabulate missingness in each case, grouped by Month, in the `airquality` dataset
airquality %>%
group_by(Month) %>%
naniar::miss_case_table()
## Warning: `cols` is now required.
## Please use `cols = c(data)`
## # A tibble: 11 x 4
## # Groups: Month [5]
## Month n_miss_in_case n_cases pct_cases
## <int> <int> <int> <dbl>
## 1 5 0 24 77.4
## 2 5 1 5 16.1
## 3 5 2 2 6.45
## 4 6 0 9 30
## 5 6 1 21 70
## 6 7 0 26 83.9
## 7 7 1 5 16.1
## 8 8 0 23 74.2
## 9 8 1 8 25.8
## 10 9 0 29 96.7
## 11 9 1 1 3.33
data(pedestrian, package="naniar")
str(pedestrian)
## Classes 'tbl_df', 'tbl' and 'data.frame': 37700 obs. of 9 variables:
## $ hourly_counts: int 883 597 294 183 118 68 47 52 120 333 ...
## $ date_time : POSIXct, format: "2016-01-01 00:00:00" "2016-01-01 01:00:00" ...
## $ year : int 2016 2016 2016 2016 2016 2016 2016 2016 2016 2016 ...
## $ month : Ord.factor w/ 12 levels "January"<"February"<..: 1 1 1 1 1 1 1 1 1 1 ...
## $ month_day : int 1 1 1 1 1 1 1 1 1 1 ...
## $ week_day : Ord.factor w/ 7 levels "Sunday"<"Monday"<..: 6 6 6 6 6 6 6 6 6 6 ...
## $ hour : int 0 1 2 3 4 5 6 7 8 9 ...
## $ sensor_id : int 2 2 2 2 2 2 2 2 2 2 ...
## $ sensor_name : chr "Bourke Street Mall (South)" "Bourke Street Mall (South)" "Bourke Street Mall (South)" "Bourke Street Mall (South)" ...
library(naniar)
# Needed so that the rle object can be converted to a data.frame inside naniar::miss_var_run()
as.data.frame.rle <- function(x, ...) do.call(data.frame, x)
# Calculate the summaries for each run of missingness for the variable `hourly_counts`
naniar::miss_var_run(pedestrian, var = hourly_counts)
## # A tibble: 35 x 2
## run_length is_na
## <int> <chr>
## 1 6628 complete
## 2 1 missing
## 3 5250 complete
## 4 624 missing
## 5 3652 complete
## 6 1 missing
## 7 1290 complete
## 8 744 missing
## 9 7420 complete
## 10 1 missing
## # ... with 25 more rows
# Calculate the summaries for each span of missingness, for a span of 4000, for the variable `hourly_counts`
naniar::miss_var_span(pedestrian, var = "hourly_counts", span_every = 4000)
## # A tibble: 10 x 5
## span_counter n_miss n_complete prop_miss prop_complete
## <int> <int> <dbl> <dbl> <dbl>
## 1 1 0 4000 0 1
## 2 2 0 4000 0 1
## 3 3 0 4000 0 1
## 4 4 0 4000 0 1
## 5 5 0 4000 0 1
## 6 6 0 4000 0 1
## 7 7 0 4000 0 1
## 8 8 0 4000 0 1
## 9 9 0 4000 0 1
## 10 10 0 4000 0 1
# For each `month` variable, calculate the run of missingness for `hourly_counts`
pedestrian %>%
group_by(month) %>%
naniar::miss_var_run(var = "hourly_counts")
## Warning: `cols` is now required.
## Please use `cols = c(data)`
## # A tibble: 51 x 3
## # Groups: month [12]
## month run_length is_na
## <ord> <int> <chr>
## 1 January 2976 complete
## 2 February 2784 complete
## 3 March 2976 complete
## 4 April 888 complete
## 5 April 552 missing
## 6 April 1440 complete
## 7 May 744 complete
## 8 May 72 missing
## 9 May 2160 complete
## 10 June 2880 complete
## # ... with 41 more rows
# For each `month` variable, calculate the span of missingness of a span of 2000
pedestrian %>%
group_by(month) %>%
naniar::miss_var_span(var = "hourly_counts", span_every = 2000)
## Warning: `cols` is now required.
## Please use `cols = c(data)`
## # A tibble: 25 x 6
## # Groups: month [12]
## month span_counter n_miss n_complete prop_miss prop_complete
## <ord> <int> <int> <dbl> <dbl> <dbl>
## 1 January 1 0 2000 0 1
## 2 January 2 0 2000 0 1
## 3 February 1 0 2000 0 1
## 4 February 2 0 2000 0 1
## 5 March 1 0 2000 0 1
## 6 March 2 0 2000 0 1
## 7 April 1 0 2000 0 1
## 8 April 2 0 2000 0 1
## 9 May 1 0 2000 0 1
## 10 May 2 0 2000 0 1
## # ... with 15 more rows
data(riskfactors, package="naniar")
str(riskfactors)
## Classes 'tbl_df', 'tbl' and 'data.frame': 245 obs. of 34 variables:
## $ state : Factor w/ 52 levels "1","2","5","6",..: 22 36 52 38 28 15 40 50 14 5 ...
## $ sex : Factor w/ 2 levels "Male","Female": 2 2 2 1 2 1 1 2 1 2 ...
## $ age : int 49 48 55 42 66 66 37 62 38 42 ...
## $ weight_lbs : int 190 170 163 230 135 165 150 170 146 260 ...
## $ height_inch : int 64 68 64 74 62 70 68 70 70 73 ...
## $ bmi : num 32.7 25.9 28 29.6 24.7 ...
## $ marital : Factor w/ 6 levels "Married","Divorced",..: 1 2 1 1 3 1 1 5 1 4 ...
## $ pregnant : Factor w/ 2 levels "Yes","No": NA NA NA NA NA NA NA NA NA 2 ...
## $ children : int 0 0 0 1 0 0 3 0 2 3 ...
## $ education : Factor w/ 6 levels "1","2","3","4",..: 6 5 4 6 5 5 6 6 4 5 ...
## $ employment : Factor w/ 7 levels "1","2","3","4",..: 2 1 5 1 1 6 2 6 1 3 ...
## $ income : Factor w/ 10 levels "<10k","10-15k",..: 6 6 1 8 7 6 8 1 7 3 ...
## $ veteran : Factor w/ 5 levels "1","2","3","4",..: 5 5 5 5 5 3 5 5 5 5 ...
## $ hispanic : Factor w/ 2 levels "Yes","No": 2 2 1 2 2 2 2 2 2 2 ...
## $ health_general : Factor w/ 6 levels "Excellent","VeryGood",..: 3 4 4 1 1 1 2 5 5 3 ...
## $ health_physical : int 3 4 0 0 0 0 0 30 30 0 ...
## $ health_mental : int 15 30 0 0 0 0 0 30 30 20 ...
## $ health_poor : int 2 3 NA NA NA NA NA 30 14 4 ...
## $ health_cover : Factor w/ 2 levels "Yes","No": 1 1 1 1 1 1 1 2 1 1 ...
## $ provide_care : Factor w/ 2 levels "Yes","No": 2 2 2 2 2 2 1 2 2 2 ...
## $ activity_limited: Factor w/ 2 levels "Yes","No": 1 2 2 2 2 2 2 2 1 NA ...
## $ drink_any : Factor w/ 2 levels "Yes","No": 2 2 2 1 2 2 1 2 2 NA ...
## $ drink_days : int NA NA NA 15 NA NA 2 NA NA NA ...
## $ drink_average : int NA NA NA NA NA NA 2 NA NA NA ...
## $ smoke_100 : Factor w/ 2 levels "Yes","No": 2 2 2 2 1 2 2 1 1 1 ...
## $ smoke_days : Factor w/ 3 levels "Everyday","Somedays",..: NA NA NA NA 1 NA NA 3 1 3 ...
## $ smoke_stop : Factor w/ 2 levels "Yes","No": NA NA NA NA 1 NA NA NA 1 NA ...
## $ smoke_last : Factor w/ 6 levels "3","4","5","6",..: NA NA NA NA NA NA NA 5 NA 3 ...
## $ diet_fruit : int 1095 52 36 NA -7 24 52 156 24 NA ...
## $ diet_salad : int 261 209 156 NA 261 52 156 24 84 NA ...
## $ diet_potato : int 104 52 52 NA 209 104 24 52 144 NA ...
## $ diet_carrot : int 156 0 24 NA 261 52 24 104 24 NA ...
## $ diet_vegetable : int 521 52 24 NA 365 365 730 365 0 NA ...
## $ diet_juice : int 12 0 24 NA 104 365 104 0 0 NA ...
# Visualize all of the missingness in the `riskfactors` dataset
naniar::vis_miss(riskfactors)
# Visualize and cluster all of the missingness in the `riskfactors` dataset
naniar::vis_miss(riskfactors, cluster = TRUE)
# Visualize and sort the columns by missingness in the `riskfactors` dataset
naniar::vis_miss(riskfactors, sort_miss = TRUE)
# Visualize the number of missings in cases using `gg_miss_case()`
naniar::gg_miss_case(riskfactors)
# Explore the number of missings in cases using `gg_miss_case()` and facet by the variable `education`
naniar::gg_miss_case(riskfactors, facet = education)
## Warning: Factor `education` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## Warning: Factor `education` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## Warning: Factor `education` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## Warning: Factor `education` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## Warning: `cols` is now required.
## Please use `cols = c(data)`
## Warning: Factor `education` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## Warning: Factor `education` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## Warning: Factor `education` contains implicit NA, consider using
## `forcats::fct_explicit_na`
# Visualize the number of missings in variables using `gg_miss_var()`
naniar::gg_miss_var(riskfactors)
# Explore the number of missings in variables using `gg_miss_var()` and facet by the variable `education`
naniar::gg_miss_var(riskfactors, facet = education)
## Warning: Factor `education` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## Warning: Factor `education` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## Warning: Factor `education` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## Warning: Factor `education` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## Warning: `cols` is now required.
## Please use `cols = c(data)`
## Warning: Factor `education` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## Warning: Factor `education` contains implicit NA, consider using
## `forcats::fct_explicit_na`
# Using the `airquality` dataset, explore the missingness pattern using `gg_miss_upset()`
naniar::gg_miss_upset(airquality)
# With the `riskfactors` dataset, explore how the missingness changes across the `marital` using `gg_miss_fct()`
naniar::gg_miss_fct(x = riskfactors, fct = marital)
## Warning: Factor `marital` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## Warning: Factor `marital` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## Warning: Factor `marital` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## Warning: Factor `marital` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## Warning: `cols` is now required.
## Please use `cols = c(data)`
## Warning: Factor `marital` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## Warning: Factor `marital` contains implicit NA, consider using
## `forcats::fct_explicit_na`
# Using the `pedestrian` dataset: Explore how the missingness changes over a span of 3000
naniar::gg_miss_span(pedestrian, var = hourly_counts, span_every = 3000)
# Using the `pedestrian` dataset: Explore the impact of `month` by facetting by `month`
# and explore how missingness changes for a span of 1000
naniar::gg_miss_span(pedestrian, var = hourly_counts, span_every = 1000, facet = month)
## Warning: `cols` is now required.
## Please use `cols = c(data)`
Chapter 2 - Wrangling and Tidying Missing Values
Search for and replace missing values:
Filling down missing values:
Missing data dependence:
Example code includes:
pacman <- tibble(year=c('2004', '1991', 'na', '1992', '1988', '2007', '2016', '2011', '2018', '2012', '1983', '1988', '1981', '1990', '1989', '1995', 'missing', 'missing', '2003', '2000', '2012', '2008', '2007', '1987', '2009', '1987', '2016', '2011', '2008', '1984', '2003', '1988', '2001', '1990', '2018', '1985', '2010', '1986', '1980', '1982', '2009', '1998', '1991', '1987', '1982', '1998', '2004', '2007', '2000', '2014', '1980', '1983', '2011', '2003', '2013', '2018', '2006', '2005', '1994', '2009', '2004', '1991', 'na', '2004', '1993', '1989', '2004', '2011', '1990', '1985', '2017', '1992', '1999', '2014', '1996', '2007', '2008', '1998', '1996', '1998', '2017', '1998', '2016', '1983', '2009', 'missing', '1993', '1989', '1994', '1980', '1983', '2004', 'missing', '1997', '1994', '2008', 'missing', '2007', '2016', '1992', '2000', '2002', '2004', '2007', '2013', '1983', '2005', '1999', '1990', '1998', '1982', '2002', 'na', '1998', '2006', '2004', '2012', '1981', '2000', '2014', '1999', '1997', '2003', '1993', '1982', '1992', '2008', '1985', '2016', '1990', '1991', '1980', '2000', 'na', '2018', 'na', '2014', '1988', 'missing', '2002', '2012', '2017', '1987', '1998', '1999', '1985', '1989', '2017', '1982', '1994', '2003', 'na', '2011', 'missing', 'missing', '1986', '2007', '2006', 'missing', '2010', '1982', '2008', '1983', '2018', '1987', '1983', 'missing', 'missing', '1998', '1988', '2010', '1981', 'na', '2016', 'na', '1992', '2001', '1995', '1999', '2009', 'na', 'na', '2003', '2017', 'na', '1982', '2005', '2013', '1990', '2004', '2004', '2006', '2009', '1984', '2007', '1987', 'na', '2001', '1983', '2012'),
month=c('6', '11', 'na', '11', '9', '12', '9', '1', '4', '9', '4', '11', '6', '7', '10', '8', 'missing', 'missing', '5', '5', '8', '1', '10', '11', '6', '7', '10', '8', '7', '1', '9', '10', '11', '7', '1', '5', '10', '6', '8', '11', '11', '8', '10', '8', '1', '9', '9', '7', '11', '11', '10', '7', '12', '9', '12', '8', '11', '4', '11', '1', '1', '9', 'na', '7', '10', '10', '3', '3', '9', '5', '8', '1', '5', '12', '6', '3', '7', '9', '12', '2', '5', '8', '4', '6', '1', 'missing', '8', '1', '12', '2', '5', '8', 'missing', '7', '2', '7', 'missing', '4', '11', '6', '5', '11', '12', '3', '3', '5', '1', '6', '12', '1', '11', '7', 'na', '9', '11', '7', '9', '10', '10', '11', '5', '11', '6', '6', '4', '10', '10', '1', '4', '6', '8', '12', '11', 'na', '11', 'na', '10', '9', 'missing', '5', '1', '5', '4', '3', '2', '11', '2', '9', '3', '3', '5', 'na', '12', 'missing', 'missing', '4', '1', '2', 'missing', '7', '2', '1', '4', '9', '4', '3', 'missing', 'missing', '1', '12', '2', '4', 'na', '3', 'na', '11', '7', '2', '5', '1', 'na', 'na', '1', '4', 'na', '10', '5', '4', '8', '2', '9', '11', '7', '10', '2', '9', 'na', '4', '6', '10'),
day=c('1', '22', 'na', '16', '16', '4', '5', '25', '14', '25', '1', '8', '17', '14', '15', '18', 'missing', 'missing', '21', '18', '21', '2', '15', '18', '22', '26', '25', '3', '24', '6', '18', '6', '4', '1', '23', '3', '10', '23', '11', '4', '11', '17', '23', '10', '8', '19', '10', '6', '24', '9', '25', '18', '7', '25', '24', '23', '17', '8', '10', '17', '8', '8', 'na', '24', '25', '7', '6', '19', '10', '13', '24', '13', '26', '4', '5', '21', '28', '15', '22', '10', '11', '15', '20', '23', '6', 'missing', '3', '2', '24', '11', '21', '21', 'missing', '24', '13', '6', 'missing', '14', '13', '17', '11', '18', '24', '9', '6', '1', '11', '21', '3', '12', '23', '27', 'na', '1', '13', '7', '17', '11', '13', '11', '20', '7', '2', '10', '9', '24', '21', '12', '25', '17', '14', '24', '18', 'na', '7', 'na', '22', '8', 'missing', '12', '14', '15', '21', '21', '1', '11', '18', '9', '18', '15', '1', 'na', '9', 'missing', 'missing', '28', '13', '12', 'missing', '3', '16', '5', '3', '19', '14', '7', 'missing', 'missing', '4', '25', '15', '4', 'na', '21', 'na', '5', '11', '10', '11', '15', 'na', 'na', '2', '10', 'na', '4', '16', '12', '11', '19', '5', '23', '24', '22', '27', '11', 'na', '23', '21', '27'),
initial=c('XGB', 'VGP', 'UAW', 'MXL', 'ZPM', 'ESF', 'YKM', 'ABS', 'NDT', 'GAS', 'IFA', 'OUH', 'PZB', 'EKR', 'TXO', 'NCV', 'XSL', 'ATM', 'LEN', 'QNE', 'CBV', 'DLU', 'LTW', 'TCV', 'BVC', 'GSP', 'LVJ', 'YQD', 'HSX', 'KNX', 'PYK', 'PVD', 'OAB', 'GHB', 'LCI', 'HMU', 'VRQ', 'WAJ', 'AIK', 'YPJ', 'BMO', 'YEH', 'YHK', 'YIA', 'TDA', 'XYF', 'LMH', 'JTO', 'ZFD', 'SXE', 'QYC', 'MPI', 'TSI', 'IVR', 'ILM', 'CME', 'FVU', 'HFJ', 'DEF', 'TCX', 'BGA', 'PBK', 'TIB', 'FYX', 'OJA', 'GEH', 'LJB', 'IHF', 'NMS', 'WSC', 'WTO', 'JBV', 'JQI', 'TCP', 'MLU', 'NBM', 'QMY', 'DLV', 'UHP', 'BGE', 'WCR', 'DNC', 'KZS', 'DBM', 'IUC', 'LRG', 'ONT', 'VKF', 'GFU', 'EQI', 'CUR', 'SAZ', 'CFU', 'SOH', 'QTM', 'CZV', 'QNR', 'LMG', 'SGR', 'DXC', 'BKI', 'CMP', 'VDR', 'CIA', 'QYW', 'CJR', 'HJQ', 'NTE', 'EGA', 'ZUY', 'AMT', 'LKP', 'HFW', 'PZQ', 'PJI', 'QJB', 'LAU', 'XYO', 'OJV', 'OBZ', 'QPV', 'LAH', 'UHW', 'XIT', 'UMB', 'OPM', 'GSC', 'PFU', 'OEC', 'ERU', 'ZWA', 'CJA', 'IGE', 'ZBQ', 'XVO', 'BWF', 'VAW', 'WDQ', 'JWT', 'QCT', 'JAH', 'WAQ', 'RCS', 'JPL', 'KCF', 'NXE', 'OPW', 'WYP', 'RMS', 'LND', 'YVO', 'XIR', 'AUW', 'OLA', 'ORF', 'ZAU', 'FXE', 'ACE', 'FQW', 'BND', 'SKA', 'BZX', 'JKY', 'IOZ', 'IYG', 'YZK', 'FOU', 'ZJT', 'XLA', 'TEZ', 'YKB', 'CYS', 'UBJ', 'DKO', 'EWZ', 'PBU', 'GEU', 'LVW', 'YWO', 'WBH', 'GXH', 'NPY', 'UIW', 'EXP', 'QAX', 'RCH', 'ZFM', 'SML', 'FNC', 'HQI', 'NQO', 'QLM', 'EGI', 'CIQ', 'ORU', 'AGP', 'MPY', 'EFL', 'VXR', 'QYE'),
score=c('892369', '2412494', '1874449', '1583331', '3159043', '2755582', '804088', '2392395', '431430', '1482088', '3099396', '810873', '2410285', '1602619', 'N/A', '1547264', '1086746', '885575', '2464437', '333868', '2991881', '1207552', '332352', '115716', 'N/A', '2551711', '679715', '3033343', '275723', '1677698', '1031285', '3251416', '1812998', '1767317', '2457197', '2194699', '1258734', '535437', '3202731', '899729', '1099688', '2125942', '2407498', '1785754', '2181741', '1058088', '1630900', '1629161', '2378243', '3211114', '65436', '2006229', '2068916', '1653110', '2589346', '1520554', '374610', 'N/A', '2841676', '1001739', '438268', '2476918', '2584965', '702929', '189630', 'N/A', '410549', '1269273', '2658430', '1760979', 'N/A', '2705304', '1560004', '826721', '3291811', '2366950', '832279', '426785', '2898752', '1369821', '2712315', '2123280', '2513951', '1004901', '645429', '846193', '313628', '1791507', '2612127', '836682', '1955459', '1866444', '75834', '532534', '3267355', '235734', '2279669', '2976729', '2297788', '1166581', '15715', '890432', '1670356', '1463904', '2867923', '1761345', '2667484', '2357424', '3053758', '2077402', '1052647', '1661650', '123930', '3171836', '1910536', '2100782', '679137', '1424599', '2194459', '1263044', '1948854', 'N/A', '3092624', '2077243', '1010777', '3289300', '3172553', '891045', '1592747', '728752', 'N/A', '24667', 'N/A', '827488', '1643701', '2844488', '539713', '3160321', '762261', '2505569', '271322', '1479487', '1217212', '2960042', '1825455', '1287888', '2105751', '450550', '894755', '3115431', '781721', '3220718', '767717', '3204211', '1666549', '3128098', '2445271', '1571440', '2088915', '645360', '2321491', '1135310', '1736847', '2378391', '3097570', '1220994', '165122', '2007635', '876910', '1551229', '1357429', '2168680', '1411345', '3290465', '1860365', '3181429', '2872190', '2780599', '2160057', '60716', '2222480', '22113', '2815280', 'N/A', '2517561', '500742', '3077608', '1481553', '1349499', 
'2539062', '2057675', '2869686', '863857', '2609949', '2337505', '76444', '3062706', '3031438', '759570', '1741154'),
country=c(' ', 'US', ' ', ' ', ' ', 'US', 'NZ', 'CA', 'GB', 'CN', 'ES', 'US', 'NZ', 'AU', 'CN', 'US', 'CA', 'US', 'US', 'CN', 'AU', 'ES', 'NZ', 'CA', 'CN', 'ES', 'NZ', 'NZ', 'CN', 'GB', 'CN', 'US', 'ES', 'CN', 'US', 'CN', 'AU', 'GB', 'ES', 'AT', ' ', 'US', 'NZ', 'AU', ' ', 'US', 'US', 'US', 'ES', 'NZ', 'AT', 'NZ', 'JP', 'ES', 'NZ', 'NZ', 'GB', 'CN', 'AU', 'GB', 'ES', 'GB', 'AT', 'NZ', 'CN', 'US', 'AU', 'GB', 'US', 'JP', 'CA', 'AT', 'AT', 'CN', 'AU', 'JP', 'CA', 'GB', 'AT', 'AU', 'GB', 'CN', 'AU', 'GB', 'AT', 'NZ', 'JP', 'GB', ' ', 'CN', 'US', 'JP', 'CN', 'GB', 'GB', 'GB', 'AT', 'US', 'GB', 'GB', 'JP', 'CN', 'AU', 'AU', 'AT', 'JP', 'US', 'JP', 'NZ', 'JP', 'AT', 'NZ', 'CA', 'CA', 'GB', 'ES', 'ES', 'GB', 'ES', 'GB', 'AU', 'GB', 'AT', 'CN', 'AT', 'ES', ' ', 'CA', 'CA', 'GB', 'AU', 'CN', 'ES', 'NZ', 'CA', 'JP', 'JP', 'NZ', 'GB', 'CA', 'ES', 'AT', 'AU', 'CA', 'CN', 'US', 'JP', 'AT', 'CA', 'JP', ' ', 'GB', 'GB', 'NZ', 'AU', 'JP', 'US', 'US', 'AU', 'US', 'AT', 'GB', 'GB', 'GB', 'AT', 'CN', 'ES', 'US', 'JP', 'GB', 'AT', 'JP', 'AU', 'NZ', 'GB', 'GB', 'ES', 'ES', 'AT', 'GB', 'CN', ' ', 'US', 'JP', 'AT', 'US', 'CA', 'AT', 'US', 'GB', 'US', 'ES', 'US', ' ', 'ES', 'JP', 'CA', 'AU', 'CA', 'US') )
pacman
## # A tibble: 200 x 6
## year month day initial score country
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 2004 6 1 XGB 892369 " "
## 2 1991 11 22 VGP 2412494 "US"
## 3 na na na UAW 1874449 " "
## 4 1992 11 16 MXL 1583331 " "
## 5 1988 9 16 ZPM 3159043 " "
## 6 2007 12 4 ESF 2755582 "US"
## 7 2016 9 5 YKM 804088 "NZ"
## 8 2011 1 25 ABS 2392395 "CA"
## 9 2018 4 14 NDT 431430 "GB"
## 10 2012 9 25 GAS 1482088 "CN"
## # ... with 190 more rows
# Explore all of the strange missing values, "N/A", "missing", " ", "na"
naniar::miss_scan_count(data = pacman, search = list("N/A", "missing", " ", "na"))
## # A tibble: 6 x 2
## Variable n
## <chr> <int>
## 1 year 23
## 2 month 23
## 3 day 23
## 4 initial 0
## 5 score 9
## 6 country 11
# Print the top of the pacman data using `head()`
head(pacman)
## # A tibble: 6 x 6
## year month day initial score country
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 2004 6 1 XGB 892369 " "
## 2 1991 11 22 VGP 2412494 "US"
## 3 na na na UAW 1874449 " "
## 4 1992 11 16 MXL 1583331 " "
## 5 1988 9 16 ZPM 3159043 " "
## 6 2007 12 4 ESF 2755582 "US"
# Replace the strange missing values "N/A" and "missing" with `NA`
pacman_clean <- naniar::replace_with_na(pacman, replace = list(year = c("N/A", "na", "missing"),
score = c("N/A", "na", "missing")
)
)
# Check whether `pacman_clean` still contains any of these values
naniar::miss_scan_count(pacman_clean, search = list("N/A", "na", "missing"))
## # A tibble: 6 x 2
## Variable n
## <chr> <int>
## 1 year 0
## 2 month 23
## 3 day 23
## 4 initial 0
## 5 score 0
## 6 country 0
# Use `replace_with_na_at()` to replace with NA
naniar::replace_with_na_at(pacman, .vars = c("year", "month", "day"),
~.x %in% c("N/A", "missing", "na", " ")
)
## # A tibble: 200 x 6
## year month day initial score country
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 2004 6 1 XGB 892369 " "
## 2 1991 11 22 VGP 2412494 "US"
## 3 <NA> <NA> <NA> UAW 1874449 " "
## 4 1992 11 16 MXL 1583331 " "
## 5 1988 9 16 ZPM 3159043 " "
## 6 2007 12 4 ESF 2755582 "US"
## 7 2016 9 5 YKM 804088 "NZ"
## 8 2011 1 25 ABS 2392395 "CA"
## 9 2018 4 14 NDT 431430 "GB"
## 10 2012 9 25 GAS 1482088 "CN"
## # ... with 190 more rows
# Use `replace_with_na_if()` to replace with NA
naniar::replace_with_na_if(pacman, .predicate = is.character,
~.x %in% c("N/A", "missing", "na")
)
## # A tibble: 200 x 6
## year month day initial score country
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 2004 6 1 XGB 892369 " "
## 2 1991 11 22 VGP 2412494 "US"
## 3 <NA> <NA> <NA> UAW 1874449 " "
## 4 1992 11 16 MXL 1583331 " "
## 5 1988 9 16 ZPM 3159043 " "
## 6 2007 12 4 ESF 2755582 "US"
## 7 2016 9 5 YKM 804088 "NZ"
## 8 2011 1 25 ABS 2392395 "CA"
## 9 2018 4 14 NDT 431430 "GB"
## 10 2012 9 25 GAS 1482088 "CN"
## # ... with 190 more rows
# Use `replace_with_na_all()` to replace with NA
naniar::replace_with_na_all(pacman, ~.x %in% c("N/A", "missing", "na"))
## # A tibble: 200 x 6
## year month day initial score country
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 2004 6 1 XGB 892369 " "
## 2 1991 11 22 VGP 2412494 "US"
## 3 <NA> <NA> <NA> UAW 1874449 " "
## 4 1992 11 16 MXL 1583331 " "
## 5 1988 9 16 ZPM 3159043 " "
## 6 2007 12 4 ESF 2755582 "US"
## 7 2016 9 5 YKM 804088 "NZ"
## 8 2011 1 25 ABS 2392395 "CA"
## 9 2018 4 14 NDT 431430 "GB"
## 10 2012 9 25 GAS 1482088 "CN"
## # ... with 190 more rows
# Toy dataset: 5 names x 4 times of day = 20 possible combinations, but only
# 15 rows are present, so 5 name/time pairs are *implicitly* missing.
frogger <- tibble(name=factor(c('jesse', 'jesse', 'jesse', 'jesse', 'andy', 'andy', 'andy', 'nic', 'nic', 'dan', 'dan', 'alex', 'alex', 'alex', 'alex')),
time=factor(c('morning', 'afternoon', 'evening', 'late_night', 'morning', 'afternoon', 'late_night', 'afternoon', 'late_night', 'morning', 'evening', 'morning', 'afternoon', 'evening', 'late_night')),
value=as.integer(c(6678, 800060, 475528, 143533, 425115, 587468, 111000, 588532, 915533, 388148, 180912, 552670, 98355, 266055, 121056))
)
str(frogger)
## Classes 'tbl_df', 'tbl' and 'data.frame': 15 obs. of 3 variables:
## $ name : Factor w/ 5 levels "alex","andy",..: 4 4 4 4 2 2 2 5 5 3 ...
## $ time : Factor w/ 4 levels "afternoon","evening",..: 4 1 2 3 4 1 3 1 3 4 ...
## $ value: int 6678 800060 475528 143533 425115 587468 111000 588532 915533 388148 ...
# Use `complete()` on the `time` variable to make implicit missing values explicit
frogger
## # A tibble: 15 x 3
## name time value
## <fct> <fct> <int>
## 1 jesse morning 6678
## 2 jesse afternoon 800060
## 3 jesse evening 475528
## 4 jesse late_night 143533
## 5 andy morning 425115
## 6 andy afternoon 587468
## 7 andy late_night 111000
## 8 nic afternoon 588532
## 9 nic late_night 915533
## 10 dan morning 388148
## 11 dan evening 180912
## 12 alex morning 552670
## 13 alex afternoon 98355
## 14 alex evening 266055
## 15 alex late_night 121056
# Turn the 5 implicitly-missing name/time combinations into explicit NA rows
frogger_tidy <- complete(frogger, name, time)
frogger_tidy
## # A tibble: 20 x 3
## name time value
## <fct> <fct> <int>
## 1 alex afternoon 98355
## 2 alex evening 266055
## 3 alex late_night 121056
## 4 alex morning 552670
## 5 andy afternoon 587468
## 6 andy evening NA
## 7 andy late_night 111000
## 8 andy morning 425115
## 9 dan afternoon NA
## 10 dan evening 180912
## 11 dan late_night NA
## 12 dan morning 388148
## 13 jesse afternoon 800060
## 14 jesse evening 475528
## 15 jesse late_night 143533
## 16 jesse morning 6678
## 17 nic afternoon 588532
## 18 nic evening NA
## 19 nic late_night 915533
## 20 nic morning NA
# Use `fill()` to fill down the name variable in the frogger dataset
frogger
## # A tibble: 15 x 3
## name time value
## <fct> <fct> <int>
## 1 jesse morning 6678
## 2 jesse afternoon 800060
## 3 jesse evening 475528
## 4 jesse late_night 143533
## 5 andy morning 425115
## 6 andy afternoon 587468
## 7 andy late_night 111000
## 8 nic afternoon 588532
## 9 nic late_night 915533
## 10 dan morning 388148
## 11 dan evening 180912
## 12 alex morning 552670
## 13 alex afternoon 98355
## 14 alex evening 266055
## 15 alex late_night 121056
# `fill()` carries the last non-missing value downward; `name` contains no
# NAs here, so the result below is identical to the input shown above.
frogger %>%
fill(name)
## # A tibble: 15 x 3
## name time value
## <fct> <fct> <int>
## 1 jesse morning 6678
## 2 jesse afternoon 800060
## 3 jesse evening 475528
## 4 jesse late_night 143533
## 5 andy morning 425115
## 6 andy afternoon 587468
## 7 andy late_night 111000
## 8 nic afternoon 588532
## 9 nic late_night 915533
## 10 dan morning 388148
## 11 dan evening 180912
## 12 alex morning 552670
## 13 alex afternoon 98355
## 14 alex evening 266055
## 15 alex late_night 121056
# Correctly fill() and complete() missing values so that our dataset becomes sensible
frogger
## # A tibble: 15 x 3
## name time value
## <fct> <fct> <int>
## 1 jesse morning 6678
## 2 jesse afternoon 800060
## 3 jesse evening 475528
## 4 jesse late_night 143533
## 5 andy morning 425115
## 6 andy afternoon 587468
## 7 andy late_night 111000
## 8 nic afternoon 588532
## 9 nic late_night 915533
## 10 dan morning 388148
## 11 dan evening 180912
## 12 alex morning 552670
## 13 alex afternoon 98355
## 14 alex evening 266055
## 15 alex late_night 121056
# fill() first (a no-op in this data, since name has no NAs), then
# complete() to add the missing name/time combinations as explicit NA rows.
frogger %>%
fill(name) %>%
complete(name, time)
## # A tibble: 20 x 3
## name time value
## <fct> <fct> <int>
## 1 alex afternoon 98355
## 2 alex evening 266055
## 3 alex late_night 121056
## 4 alex morning 552670
## 5 andy afternoon 587468
## 6 andy evening NA
## 7 andy late_night 111000
## 8 andy morning 425115
## 9 dan afternoon NA
## 10 dan evening 180912
## 11 dan late_night NA
## 12 dan morning 388148
## 13 jesse afternoon 800060
## 14 jesse evening 475528
## 15 jesse late_night 143533
## 16 jesse morning 6678
## 17 nic afternoon 588532
## 18 nic evening NA
## 19 nic late_night 915533
## 20 nic morning NA
data("oceanbuoys", package="naniar")
str(oceanbuoys)
## Classes 'tbl_df', 'tbl' and 'data.frame': 736 obs. of 8 variables:
## $ year : num 1997 1997 1997 1997 1997 ...
## $ latitude : num 0 0 0 0 0 0 0 0 0 0 ...
## $ longitude : num -110 -110 -110 -110 -110 -110 -110 -110 -110 -110 ...
## $ sea_temp_c: num 27.6 27.5 27.6 27.6 27.6 ...
## $ air_temp_c: num 27.1 27 27 26.9 26.8 ...
## $ humidity : num 79.6 75.8 76.5 76.2 76.4 ...
## $ wind_ew : num -6.4 -5.3 -5.1 -4.9 -3.5 ...
## $ wind_ns : num 5.4 5.3 4.5 2.5 4.1 ...
# Sorting the rows before vis_miss() can reveal whether missingness
# clusters with a variable's values (a hint of MAR rather than MCAR).
# Arrange by year
oceanbuoys %>%
arrange(year) %>%
naniar::vis_miss()
# Arrange by latitude
oceanbuoys %>%
arrange(latitude) %>%
naniar::vis_miss()
# Arrange by wind_ew (wind east west)
oceanbuoys %>%
arrange(wind_ew) %>%
naniar::vis_miss()
Chapter 3 - Testing Missing Relationships
Tools to explore missing data dependence:
Visualizing missingness across one variable:
Visualizing missingness across two variables:
Example code includes:
# Create shadow matrix data with `as_shadow()`
# One *_NA factor column per variable: "!NA" = value present, "NA" = missing.
naniar::as_shadow(oceanbuoys)
## # A tibble: 736 x 8
## year_NA latitude_NA longitude_NA sea_temp_c_NA air_temp_c_NA humidity_NA
## <fct> <fct> <fct> <fct> <fct> <fct>
## 1 !NA !NA !NA !NA !NA !NA
## 2 !NA !NA !NA !NA !NA !NA
## 3 !NA !NA !NA !NA !NA !NA
## 4 !NA !NA !NA !NA !NA !NA
## 5 !NA !NA !NA !NA !NA !NA
## 6 !NA !NA !NA !NA !NA !NA
## 7 !NA !NA !NA !NA !NA !NA
## 8 !NA !NA !NA !NA !NA !NA
## 9 !NA !NA !NA !NA !NA !NA
## 10 !NA !NA !NA !NA !NA !NA
## # ... with 726 more rows, and 2 more variables: wind_ew_NA <fct>,
## # wind_ns_NA <fct>
# Create nabular data by binding the shadow to the data with `bind_shadow()`
# "Nabular" = original columns plus their *_NA shadow columns side by side.
naniar::bind_shadow(oceanbuoys)
## # A tibble: 736 x 16
## year latitude longitude sea_temp_c air_temp_c humidity wind_ew wind_ns
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1997 0 -110 27.6 27.1 79.6 -6.40 5.40
## 2 1997 0 -110 27.5 27.0 75.8 -5.30 5.30
## 3 1997 0 -110 27.6 27 76.5 -5.10 4.5
## 4 1997 0 -110 27.6 26.9 76.2 -4.90 2.5
## 5 1997 0 -110 27.6 26.8 76.4 -3.5 4.10
## 6 1997 0 -110 27.8 26.9 76.7 -4.40 1.60
## 7 1997 0 -110 28.0 27.0 76.5 -2 3.5
## 8 1997 0 -110 28.0 27.1 78.3 -3.70 4.5
## 9 1997 0 -110 28.0 27.2 78.6 -4.20 5
## 10 1997 0 -110 28.0 27.2 76.9 -3.60 3.5
## # ... with 726 more rows, and 8 more variables: year_NA <fct>,
## # latitude_NA <fct>, longitude_NA <fct>, sea_temp_c_NA <fct>,
## # air_temp_c_NA <fct>, humidity_NA <fct>, wind_ew_NA <fct>, wind_ns_NA <fct>
# Bind only the variables with missing values by using bind_shadow(only_miss = TRUE)
# Only 3 shadow columns appear below, so only sea_temp_c, air_temp_c and
# humidity contain NAs in this dataset.
naniar::bind_shadow(oceanbuoys, only_miss = TRUE)
## # A tibble: 736 x 11
## year latitude longitude sea_temp_c air_temp_c humidity wind_ew wind_ns
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1997 0 -110 27.6 27.1 79.6 -6.40 5.40
## 2 1997 0 -110 27.5 27.0 75.8 -5.30 5.30
## 3 1997 0 -110 27.6 27 76.5 -5.10 4.5
## 4 1997 0 -110 27.6 26.9 76.2 -4.90 2.5
## 5 1997 0 -110 27.6 26.8 76.4 -3.5 4.10
## 6 1997 0 -110 27.8 26.9 76.7 -4.40 1.60
## 7 1997 0 -110 28.0 27.0 76.5 -2 3.5
## 8 1997 0 -110 28.0 27.1 78.3 -3.70 4.5
## 9 1997 0 -110 28.0 27.2 78.6 -4.20 5
## 10 1997 0 -110 28.0 27.2 76.9 -3.60 3.5
## # ... with 726 more rows, and 3 more variables: sea_temp_c_NA <fct>,
## # air_temp_c_NA <fct>, humidity_NA <fct>
# `bind_shadow()` and `group_by()` humidity missingness (`humidity_NA`)
# Compare wind_ew summaries for rows where humidity is present vs missing;
# a large difference would suggest the missingness depends on wind_ew.
oceanbuoys %>%
naniar::bind_shadow() %>%
group_by(humidity_NA) %>%
summarise(wind_ew_mean = mean(wind_ew),
wind_ew_sd = sd(wind_ew)
)
## # A tibble: 2 x 3
## humidity_NA wind_ew_mean wind_ew_sd
## <fct> <dbl> <dbl>
## 1 !NA -3.78 1.90
## 2 NA -3.30 2.31
# Repeat this, but calculating summaries for wind north south (`wind_ns`).
# Same pattern: split wind_ns by whether humidity is missing.
oceanbuoys %>%
naniar::bind_shadow() %>%
group_by(humidity_NA) %>%
summarise(wind_ns_mean = mean(wind_ns),
wind_ns_sd = sd(wind_ns)
)
## # A tibble: 2 x 3
## humidity_NA wind_ns_mean wind_ns_sd
## <fct> <dbl> <dbl>
## 1 !NA 2.78 2.06
## 2 NA 1.66 2.23
# Summarise wind_ew by the missingness of `air_temp_c_NA`
# n_obs shows how many rows fall in each missingness group.
oceanbuoys %>%
naniar::bind_shadow() %>%
group_by(air_temp_c_NA) %>%
summarise(wind_ew_mean = mean(wind_ew),
wind_ew_sd = sd(wind_ew),
n_obs = n()
)
## # A tibble: 2 x 4
## air_temp_c_NA wind_ew_mean wind_ew_sd n_obs
## <fct> <dbl> <dbl> <int>
## 1 !NA -3.91 1.85 655
## 2 NA -2.17 2.14 81
# Summarise wind_ew by missingness of `air_temp_c_NA` and `humidity_NA`
# Cross-classifying two shadow columns: note the tiny group (n = 3 below)
# where both variables are missing at once.
oceanbuoys %>%
naniar::bind_shadow() %>%
group_by(air_temp_c_NA, humidity_NA) %>%
summarise(wind_ew_mean = mean(wind_ew),
wind_ew_sd = sd(wind_ew),
n_obs = n()
)
## # A tibble: 4 x 5
## # Groups: air_temp_c_NA [2]
## air_temp_c_NA humidity_NA wind_ew_mean wind_ew_sd n_obs
## <fct> <fct> <dbl> <dbl> <int>
## 1 !NA !NA -4.01 1.74 565
## 2 !NA NA -3.24 2.31 90
## 3 NA !NA -2.06 2.08 78
## 4 NA NA -4.97 1.74 3
# First explore the missingness structure of `oceanbuoys` using `vis_miss()`
naniar::vis_miss(oceanbuoys)
# Explore the distribution of `wind_ew` for the missingness of `air_temp_c_NA` using `geom_density()`
# Overlaid densities: if the two curves differ, missingness in air_temp_c
# is related to wind_ew.
naniar::bind_shadow(oceanbuoys) %>%
ggplot(aes(x = wind_ew, color = air_temp_c_NA)) +
geom_density()
# Explore the distribution of sea temperature for the missingness of humidity (humidity_NA) using `geom_density()`
naniar::bind_shadow(oceanbuoys) %>%
ggplot(aes(x = sea_temp_c, color = humidity_NA)) +
geom_density()
## Warning: Removed 3 rows containing non-finite values (stat_density).
# The plots below repeat the same idea with different geoms: density,
# boxplot, and geom_miss_point(), each conditioned on shadow columns.
# Explore the distribution of wind east west (`wind_ew`) for the missingness of air temperature using `geom_density()` and facetting by the missingness of air temperature (`air_temp_c_NA`).
oceanbuoys %>%
naniar::bind_shadow() %>%
ggplot(aes(x = wind_ew)) +
geom_density() +
facet_wrap(~air_temp_c_NA)
# Build upon this visualisation by coloring by the missingness of humidity (`humidity_NA`).
oceanbuoys %>%
naniar::bind_shadow() %>%
ggplot(aes(x = wind_ew, color = humidity_NA)) +
geom_density() +
facet_wrap(~air_temp_c_NA)
# Explore the distribution of wind east west (`wind_ew`) for the missingness of air temperature using `geom_boxplot()`
oceanbuoys %>%
naniar::bind_shadow() %>%
ggplot(aes(x = air_temp_c_NA, y = wind_ew)) +
geom_boxplot()
# Build upon this visualisation by facetting by the missingness of humidity (`humidity_NA`).
oceanbuoys %>%
naniar::bind_shadow() %>%
ggplot(aes(x = air_temp_c_NA, y = wind_ew)) +
geom_boxplot() +
facet_wrap(~humidity_NA)
# Explore the missingness in wind and air temperature, and display the missingness using `geom_miss_point()`
# geom_miss_point() shows missing values in the plot margins rather than
# silently dropping them like geom_point() does.
ggplot(oceanbuoys, aes(x = wind_ew, y = air_temp_c)) +
naniar::geom_miss_point()
# Explore the missingness in humidity and air temperature, and display the missingness using `geom_miss_point()`
ggplot(oceanbuoys, aes(x = humidity, y = air_temp_c)) +
naniar::geom_miss_point()
# Explore the missingness in wind and air temperature, and display the missingness using `geom_miss_point()`. Facet by year to explore this further.
ggplot(oceanbuoys, aes(x = wind_ew, y = air_temp_c)) +
naniar::geom_miss_point() +
facet_wrap(~year)
# Explore the missingness in humidity and air temperature, and display the missingness using `geom_miss_point()` Facet by year to explore this further.
ggplot(oceanbuoys, aes(x=humidity, y=air_temp_c)) +
naniar::geom_miss_point() +
facet_wrap(~year)
# Use geom_miss_point() and facet_wrap to explore how the missingness in wind_ew and air_temp_c is different for missingness of humidity
naniar::bind_shadow(oceanbuoys) %>%
ggplot(aes(x = wind_ew, y = air_temp_c)) +
naniar::geom_miss_point() +
facet_wrap(~humidity_NA)
# Use geom_miss_point() and facet_grid to explore how the missingness in wind_ew and air_temp_c is different for missingness of humidity AND by year - by using `facet_grid(humidity_NA ~ year)`
naniar::bind_shadow(oceanbuoys) %>%
ggplot(aes(x = wind_ew, y = air_temp_c)) +
naniar::geom_miss_point() +
facet_grid(humidity_NA~year)
Chapter 4 - Imputation
Filling in the blanks:
What makes a good imputation?
Performing imputations:
Evaluating imputations and models:
Example code includes:
# Impute the oceanbuoys data below the range using `impute_below`.
# Imputing *below* the observed range makes the formerly-missing points
# visually distinguishable in scatterplots.
ocean_imp <- naniar::impute_below_all(oceanbuoys)
# Visualise the new missing values
ggplot(ocean_imp, aes(x = wind_ew, y = air_temp_c)) +
geom_point()
# Impute and track data with `bind_shadow`, `impute_below_all`, and `add_label_shadow`
# Binding the shadow BEFORE imputing preserves a record of which values
# were originally missing; add_label_shadow() adds an `any_missing` label.
ocean_imp_track <- naniar::bind_shadow(oceanbuoys) %>%
naniar::impute_below_all() %>%
naniar::add_label_shadow()
# Look at the imputed values
ocean_imp_track
## # A tibble: 736 x 17
## year latitude longitude sea_temp_c air_temp_c humidity wind_ew wind_ns
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1997 0 -110 27.6 27.1 79.6 -6.40 5.40
## 2 1997 0 -110 27.5 27.0 75.8 -5.30 5.30
## 3 1997 0 -110 27.6 27 76.5 -5.10 4.5
## 4 1997 0 -110 27.6 26.9 76.2 -4.90 2.5
## 5 1997 0 -110 27.6 26.8 76.4 -3.5 4.10
## 6 1997 0 -110 27.8 26.9 76.7 -4.40 1.60
## 7 1997 0 -110 28.0 27.0 76.5 -2 3.5
## 8 1997 0 -110 28.0 27.1 78.3 -3.70 4.5
## 9 1997 0 -110 28.0 27.2 78.6 -4.20 5
## 10 1997 0 -110 28.0 27.2 76.9 -3.60 3.5
## # ... with 726 more rows, and 9 more variables: year_NA <fct>,
## # latitude_NA <fct>, longitude_NA <fct>, sea_temp_c_NA <fct>,
## # air_temp_c_NA <fct>, humidity_NA <fct>, wind_ew_NA <fct>, wind_ns_NA <fct>,
## # any_missing <chr>
# Color by the `any_missing` label so the below-range imputed points stand out.
ggplot(ocean_imp_track, aes(x=wind_ew, y=air_temp_c, colour=any_missing)) +
geom_point()
# Visualise the missingness in wind and air temperature, coloring missing air temp values with air_temp_c_NA
ggplot(ocean_imp_track, aes(x = wind_ew, y = air_temp_c, color = air_temp_c_NA)) +
geom_point()
# Visualise humidity and air temp, coloring any missing cases using the variable any_missing
ggplot(ocean_imp_track, aes(x = humidity, y = air_temp_c, color = any_missing)) +
geom_point()
# Explore the values of air_temp_c, visualising the amount of missings with `air_temp_c_NA`.
p <- ggplot(ocean_imp_track, aes(x = air_temp_c, fill = air_temp_c_NA)) +
geom_histogram()
# Explore the missings in humidity using humidity_NA
p2 <- ggplot(ocean_imp_track, aes(x = humidity, fill = humidity_NA)) +
geom_histogram()
# Explore the missings in air_temp_c according to year, using `facet_wrap(~year)`.
p + facet_wrap(~year)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Explore the missings in humidity according to year, using `facet_wrap(~year)`.
p2 + facet_wrap(~year)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Impute the mean value and track the imputations
ocean_imp_mean <- naniar::bind_shadow(oceanbuoys) %>%
naniar::impute_mean_all() %>%
naniar::add_label_shadow()
# Explore the mean values in humidity in the imputed dataset
# Boxplots split by the shadow column show imputed vs observed values.
ggplot(ocean_imp_mean, aes(x = humidity_NA, y = humidity)) +
geom_boxplot()
# Explore the values in air temperature in the imputed dataset
ggplot(ocean_imp_mean, aes(x = air_temp_c_NA, y = air_temp_c)) +
geom_boxplot()
# Explore imputations in air temperature and humidity, coloring by the variable, any_missing
ggplot(ocean_imp_mean, aes(x = air_temp_c, y = humidity, color = any_missing)) +
geom_point()
# Explore imputations in air temperature and humidity, coloring by the variable, any_missing, and faceting by year
ggplot(ocean_imp_mean, aes(x = air_temp_c, y = humidity, color = any_missing)) +
geom_point() +
facet_wrap(~year)
# Gather the imputed data
# shadow_long() reshapes the listed variables into long (variable/value)
# form together with their shadow columns.
ocean_imp_mean_gather <- naniar::shadow_long(ocean_imp_mean, humidity, air_temp_c)
# Inspect the data
ocean_imp_mean_gather
## # A tibble: 1,472 x 4
## variable value variable_NA value_NA
## <chr> <chr> <chr> <chr>
## 1 air_temp_c 27.14999962 air_temp_c_NA !NA
## 2 air_temp_c 27.02000046 air_temp_c_NA !NA
## 3 air_temp_c 27 air_temp_c_NA !NA
## 4 air_temp_c 26.93000031 air_temp_c_NA !NA
## 5 air_temp_c 26.84000015 air_temp_c_NA !NA
## 6 air_temp_c 26.94000053 air_temp_c_NA !NA
## 7 air_temp_c 27.04000092 air_temp_c_NA !NA
## 8 air_temp_c 27.11000061 air_temp_c_NA !NA
## 9 air_temp_c 27.20999908 air_temp_c_NA !NA
## 10 air_temp_c 27.25 air_temp_c_NA !NA
## # ... with 1,462 more rows
# Explore the imputations in a histogram
# `value` is stored as character in the long form (see the <chr> column
# above), hence the as.numeric() coercion before plotting.
ggplot(ocean_imp_mean_gather, aes(x = as.numeric(value), fill = value_NA)) +
geom_histogram() +
facet_wrap(~variable)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Impute humidity and air temperature using wind_ew and wind_ns, and track missing values
# Model-based imputation: simputation::impute_lm() fills NAs with linear
# model predictions instead of a single constant.
ocean_imp_lm_wind <- oceanbuoys %>%
naniar::bind_shadow() %>%
simputation::impute_lm(air_temp_c ~ wind_ew + wind_ns) %>%
simputation::impute_lm(humidity ~ wind_ew + wind_ns) %>%
naniar::add_label_shadow()
# Plot the imputed values for air_temp_c and humidity, colored by missingness
ggplot(ocean_imp_lm_wind, aes(x = air_temp_c, y = humidity, color = any_missing)) +
geom_point()
# Bind the models together
# bind_rows() with .id labels each stacked dataset by its imputation model.
bound_models <- bind_rows(mean = ocean_imp_mean,
lm_wind = ocean_imp_lm_wind,
.id = "imp_model")
# Inspect the values of air_temp and humidity as a scatterplot
ggplot(bound_models, aes(x = air_temp_c, y = humidity, color = any_missing)) +
geom_point() +
facet_wrap(~imp_model)
# Build a model adding year to the outcome
# Fix: `bind_shadow()` was called without its namespace prefix; naniar is
# never attached with library() in this document, so qualify it here for
# consistency with every other call (naniar::bind_shadow).
ocean_imp_lm_wind_year <- naniar::bind_shadow(oceanbuoys) %>%
simputation::impute_lm(air_temp_c ~ wind_ew + wind_ns + year) %>%
simputation::impute_lm(humidity ~ wind_ew + wind_ns + year) %>%
naniar::add_label_shadow()
# Bind the mean, lm_wind, and lm_wind_year models together
bound_models <- bind_rows(mean = ocean_imp_mean,
lm_wind = ocean_imp_lm_wind,
lm_wind_year = ocean_imp_lm_wind_year,
.id = "imp_model"
)
# Explore air_temp and humidity, coloring by any missings, and faceting by imputation model
ggplot(bound_models, aes(x = air_temp_c, y = humidity, color = any_missing)) +
geom_point() +
facet_wrap(~imp_model)
# Gather the data and inspect the distributions of the values
# NOTE(review): gather() is superseded by tidyr::pivot_longer(); kept here
# to match the captured output of this transcript.
bound_models_gather <- bound_models %>%
select(air_temp_c, humidity, any_missing, imp_model) %>%
gather(key = "key", value = "value", -any_missing, -imp_model)
# Inspect the distribution for each variable, for each model
ggplot(bound_models_gather, aes(x = imp_model, y = value, color = imp_model)) +
geom_boxplot() +
facet_wrap(~key, scales = "free_y")
# Inspect the imputed values
# Filtering to any_missing == "Missing" isolates the imputed values only.
bound_models_gather %>%
filter(any_missing == "Missing") %>%
ggplot(aes(x = imp_model, y = value, color = imp_model)) +
geom_boxplot() +
facet_wrap(~key, scales = "free_y")
# Create an imputed dataset using a linear models
ocean_imp_lm_all <- naniar::bind_shadow(oceanbuoys) %>%
naniar::add_label_shadow() %>%
simputation::impute_lm(sea_temp_c ~ wind_ew + wind_ns + year + latitude + longitude) %>%
simputation::impute_lm(air_temp_c ~ wind_ew + wind_ns + year + latitude + longitude) %>%
simputation::impute_lm(humidity ~ wind_ew + wind_ns + year + latitude + longitude)
# Bind the datasets
bound_models <- bind_rows(imp_lm_wind_year = ocean_imp_lm_wind_year,
imp_lm_wind = ocean_imp_lm_wind,
imp_lm_all = ocean_imp_lm_all,
.id = "imp_model"
)
# Look at the models
bound_models
## # A tibble: 2,208 x 18
## imp_model year latitude longitude sea_temp_c air_temp_c humidity wind_ew
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 imp_lm_w~ 1997 0 -110 27.6 27.1 79.6 -6.40
## 2 imp_lm_w~ 1997 0 -110 27.5 27.0 75.8 -5.30
## 3 imp_lm_w~ 1997 0 -110 27.6 27 76.5 -5.10
## 4 imp_lm_w~ 1997 0 -110 27.6 26.9 76.2 -4.90
## 5 imp_lm_w~ 1997 0 -110 27.6 26.8 76.4 -3.5
## 6 imp_lm_w~ 1997 0 -110 27.8 26.9 76.7 -4.40
## 7 imp_lm_w~ 1997 0 -110 28.0 27.0 76.5 -2
## 8 imp_lm_w~ 1997 0 -110 28.0 27.1 78.3 -3.70
## 9 imp_lm_w~ 1997 0 -110 28.0 27.2 78.6 -4.20
## 10 imp_lm_w~ 1997 0 -110 28.0 27.2 76.9 -3.60
## # ... with 2,198 more rows, and 10 more variables: wind_ns <dbl>,
## # year_NA <fct>, latitude_NA <fct>, longitude_NA <fct>, sea_temp_c_NA <fct>,
## # air_temp_c_NA <fct>, humidity_NA <fct>, wind_ew_NA <fct>, wind_ns_NA <fct>,
## # any_missing <chr>
# Create the model summary for each dataset
# nest() packs each imputation model's rows into a list-column, then
# purrr::map() fits the same lm to each nested dataset.
model_summary <- bound_models %>%
group_by(imp_model) %>%
nest() %>%
mutate(mod = map(data, ~lm(sea_temp_c ~ air_temp_c + humidity + year, data = .)),
res = map(mod, residuals), pred = map(mod, predict), tidy = map(mod, broom::tidy)
)
# Explore the coefficients in the model
# unnest() without `cols` triggers the deprecation warning shown below;
# `unnest(cols = c(tidy))` is the modern form.
model_summary %>%
select(imp_model, tidy) %>%
unnest()
## Warning: `cols` is now required.
## Please use `cols = c(tidy)`
## # A tibble: 12 x 6
## # Groups: imp_model [3]
## imp_model term estimate std.error statistic p.value
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 imp_lm_wind_year (Intercept) -614. 48.5 -12.7 2.49e- 33
## 2 imp_lm_wind_year air_temp_c 0.927 0.0235 39.5 3.25e-183
## 3 imp_lm_wind_year humidity 0.0221 0.00427 5.17 3.05e- 7
## 4 imp_lm_wind_year year 0.308 0.0245 12.6 7.14e- 33
## 5 imp_lm_wind (Intercept) -1742. 56.1 -31.0 1.83e-135
## 6 imp_lm_wind air_temp_c 0.365 0.0279 13.1 2.73e- 35
## 7 imp_lm_wind humidity 0.0225 0.00690 3.26 1.17e- 3
## 8 imp_lm_wind year 0.880 0.0283 31.1 6.79e-136
## 9 imp_lm_all (Intercept) -697. 51.8 -13.5 5.04e- 37
## 10 imp_lm_all air_temp_c 0.890 0.0255 35.0 2.90e-158
## 11 imp_lm_all humidity 0.0127 0.00463 2.75 6.03e- 3
## 12 imp_lm_all year 0.351 0.0262 13.4 1.12e- 36
# The imputation model judged best from the coefficient comparison above.
best_model <- "imp_lm_all"
Chapter 1 - Presidential Job Approval Polls
Introduction:
Averaging Job Approval by President:
Visualizing Trump’s Approval Over Time:
Example code includes:
approval_polls <- readr::read_csv("./RInputFiles/gallup_approval_polls.csv")
## Parsed with column specification:
## cols(
## President = col_character(),
## Date = col_character(),
## Approve = col_double(),
## Disapprove = col_double(),
## Inaug = col_character(),
## Days = col_double()
## )
glimpse(approval_polls)
## Observations: 4,209
## Variables: 6
## $ President <chr> "Trump", "Trump", "Trump", "Trump", "Trump", "Trump", "T...
## $ Date <chr> "12/12/2017", "12/9/2017", "12/6/2017", "12/3/2017", "11...
## $ Approve <dbl> 36, 36, 37, 35, 34, 37, 38, 36, 39, 37, 39, 37, 37, 39, ...
## $ Disapprove <dbl> 59, 59, 59, 60, 60, 56, 55, 57, 56, 57, 55, 57, 57, 57, ...
## $ Inaug <chr> "1/20/17", "1/20/17", "1/20/17", "1/20/17", "1/20/17", "...
## $ Days <dbl> 326, 323, 320, 317, 314, 311, 308, 304, 301, 298, 295, 2...
# Select President, Date, and Approve from approval_polls
approval_polls %>%
select(President, Date, Approve) %>%
head()
## # A tibble: 6 x 3
## President Date Approve
## <chr> <chr> <dbl>
## 1 Trump 12/12/2017 36
## 2 Trump 12/9/2017 36
## 3 Trump 12/6/2017 37
## 4 Trump 12/3/2017 35
## 5 Trump 11/30/2017 34
## 6 Trump 11/27/2017 37
# Select the President, Date, and Approve columns and filter to observations where President is equal to "Trump"
approval_polls %>%
select(President, Date, Approve) %>%
filter(President == "Trump")
## # A tibble: 108 x 3
## President Date Approve
## <chr> <chr> <dbl>
## 1 Trump 12/12/2017 36
## 2 Trump 12/9/2017 36
## 3 Trump 12/6/2017 37
## 4 Trump 12/3/2017 35
## 5 Trump 11/30/2017 34
## 6 Trump 11/27/2017 37
## 7 Trump 11/24/2017 38
## 8 Trump 11/20/2017 36
## 9 Trump 11/17/2017 39
## 10 Trump 11/14/2017 37
## # ... with 98 more rows
# Group the approval_polls dataset by president and summarise a mean of the Approve column
# One row per president; output is sorted alphabetically by name.
approval_polls %>%
group_by(President) %>%
summarise(Approve = mean(Approve))
## # A tibble: 14 x 2
## President Approve
## <chr> <dbl>
## 1 Bush 1 60.1
## 2 Bush 2 51.0
## 3 Carter 46.1
## 4 Clinton 55.6
## 5 Eisenhower 64.2
## 6 Ford 45.9
## 7 Johnson 55.4
## 8 Kennedy 70.2
## 9 Nixon 47.1
## 10 Obama 47.8
## 11 Reagan 52.5
## 12 Roosevelt 72.3
## 13 Truman 42.1
## 14 Trump 38.6
# Extract, or "pull," the Approve column as a vector and save it to the object "TrumpApproval"
# (filter first, then trim to the columns of interest, then pull the vector)
TrumpApproval <- approval_polls %>%
filter(President == "Trump") %>%
select(President, Date, Approve) %>%
pull(Approve)
# Take a mean of the TrumpApproval vector
mean(TrumpApproval)
## [1] 38.62963
# Select the relevant columns from the approval_polls dataset and filter them for the Trump presidency
TrumpPolls <- approval_polls %>%
select(President, Date, Approve) %>%
filter(President == "Trump")
# Use the months() and mdy() function to get the month of the day each poll was taken
# Group the dataset by month and summarize a mean of Trump's job approval by month
# Note: months() returns the month *name* as character, so the summary
# below prints in alphabetical rather than calendar order.
TrumpPolls %>%
mutate(Month = months(lubridate::mdy(Date))) %>%
group_by(Month) %>%
summarise(Approve = mean(Approve))
## # A tibble: 12 x 2
## Month Approve
## <chr> <dbl>
## 1 April 40.6
## 2 August 35.7
## 3 December 36
## 4 February 41.6
## 5 January 44
## 6 July 38
## 7 June 37.9
## 8 March 40.5
## 9 May 39.9
## 10 November 37.3
## 11 October 36.7
## 12 September 37.5
# Save Donald Trump's approval polling to a separate object
# Sort by parsed date so the rolling mean moves forward in time.
TrumpApproval <- approval_polls %>%
filter(President == "Trump") %>%
mutate(Date = lubridate::mdy(Date)) %>%
arrange(Date)
# use the rollmean() function from the zoo package to get a moving average of the last 10 polls
# align = "right" makes each value the mean of the current and previous 9
# polls; the first 9 entries are NA (hence the 9 removed rows warned below).
# NOTE(review): na.pad is a deprecated alias for fill = NA in zoo — confirm
# against the zoo::rollmean documentation before modernizing.
TrumpApproval <- TrumpApproval %>%
mutate(AvgApprove = zoo::rollmean(Approve, 10, na.pad=TRUE, align = "right"))
# Use ggplot to graph Trump's average approval over time
ggplot(data = TrumpApproval, aes(x=Date, y=AvgApprove)) +
geom_line()
## Warning: Removed 9 rows containing missing values (geom_path).
# Create a moving average of each president's approval rating
# mutate() after group_by() computes the rolling mean within each president
# separately, so averages never bleed across administrations.
AllApproval <- approval_polls %>%
group_by(President) %>%
mutate(AvgApprove = zoo::rollmean(Approve, 10, na.pad=TRUE, align = "right"))
# Graph a moving average of each president's approval rating
ggplot(data = AllApproval, aes(x=Days, y=AvgApprove, col=President)) +
geom_line()
## Warning: Removed 126 rows containing missing values (geom_path).
Chapter 2 - US House and Senate Polling
Elections and Polling Parties:
73 Years of “Generic Ballot” Polls:
Calculating and Visualizing Error in Polls:
Predicting Winners with Linear Regression:
Example code includes:
generic_ballot <- readr::read_csv("./RInputFiles/generic_ballot.csv")
## Parsed with column specification:
## cols(
## Date = col_character(),
## Democrats = col_double(),
## Republicans = col_double(),
## ElecYear = col_double(),
## ElecDay = col_character(),
## DaysTilED = col_double(),
## DemVote = col_double(),
## RepVote = col_double()
## )
glimpse(generic_ballot)
## Observations: 2,559
## Variables: 8
## $ Date <chr> "7/4/1945", "7/19/1945", "10/23/1945", "11/28/1945", "1...
## $ Democrats <dbl> 44, 38, 36, 40, 40, 40, 50, 37, 37, 37, 35, 39, 38, 38,...
## $ Republicans <dbl> 31, 31, 51, 34, 34, 33, 39, 37, 32, 34, 34, 35, 35, 38,...
## $ ElecYear <dbl> 1946, 1946, 1946, 1946, 1946, 1946, 1946, 1946, 1946, 1...
## $ ElecDay <chr> "11/5/1946", "11/5/1946", "11/5/1946", "11/5/1946", "11...
## $ DaysTilED <dbl> 489, 474, 378, 342, 299, 279, 279, 245, 202, 188, 167, ...
## $ DemVote <dbl> 45.0, 45.0, 45.0, 45.0, 45.0, 45.0, 45.0, 45.0, 45.0, 4...
## $ RepVote <dbl> 53.0, 53.0, 53.0, 53.0, 53.0, 53.0, 53.0, 53.0, 53.0, 5...
# Look at the header and first few rows of the data
head(generic_ballot)
## # A tibble: 6 x 8
## Date Democrats Republicans ElecYear ElecDay DaysTilED DemVote RepVote
## <chr> <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 7/4/1945 44 31 1946 11/5/1946 489 45 53
## 2 7/19/1945 38 31 1946 11/5/1946 474 45 53
## 3 10/23/1945 36 51 1946 11/5/1946 378 45 53
## 4 11/28/1945 40 34 1946 11/5/1946 342 45 53
## 5 1/10/1946 40 34 1946 11/5/1946 299 45 53
## 6 1/30/1946 40 33 1946 11/5/1946 279 45 53
# Filter the election year to 2016 and select the Date, Democrats, and Republicans columns
generic_ballot %>%
filter(ElecYear == 2016) %>%
select(Date, Democrats, Republicans)
## # A tibble: 125 x 3
## Date Democrats Republicans
## <chr> <dbl> <dbl>
## 1 11/16/2014 41 41
## 2 11/23/2014 39 43
## 3 11/30/2014 40 41
## 4 12/7/2014 40 39
## 5 12/14/2014 37 40
## 6 12/21/2014 39 40
## 7 12/28/2014 39 39
## 8 1/4/2015 40 38
## 9 1/11/2015 38 38
## 10 1/18/2015 39 38
## # ... with 115 more rows
# Mutate a new variable called "Democratic.Margin" equal to the difference between Democrats' vote share and Republicans'
# Positive values = Democratic lead in that poll, negative = Republican lead.
democratic_lead <- generic_ballot %>%
mutate(Democratic.Margin = Democrats - Republicans)
# Take a look at that new variable!
democratic_lead %>%
select(Democratic.Margin)
## # A tibble: 2,559 x 1
## Democratic.Margin
## <dbl>
## 1 13
## 2 7
## 3 -15
## 4 6
## 5 6
## 6 7
## 7 11
## 8 0
## 9 5
## 10 3
## # ... with 2,549 more rows
# Group the generic ballot dataset by year and summarise an average of the Democratic.Margin variable
# Fix: use the `summarise` spelling for consistency with the rest of this
# document (summarize/summarise are identical dplyr aliases).
over_time <- democratic_lead %>%
group_by(ElecYear) %>%
summarise(Democratic.Margin = mean(Democratic.Margin))
# Explore the data.frame
head(over_time)
## # A tibble: 6 x 2
## ElecYear Democratic.Margin
## <dbl> <dbl>
## 1 1946 0.524
## 2 1948 -0.333
## 3 1950 11.1
## 4 1952 -0.975
## 5 1954 9.40
## 6 1956 11.1
# Create a month and year variable for averaging polls by approximate date
timeseries <- democratic_lead %>%
mutate(Date = lubridate::mdy(Date), month = lubridate::month(Date), yr = lubridate::year(Date))
# Now group the polls by their month and year, then summarise
timeseries <- timeseries %>%
group_by(yr, month) %>%
summarise(Democratic.Margin = mean(Democratic.Margin))
# Mutate a new variable to use a date summary for the monthly average
# Builds "yyyy-m-01" strings (one per month); lubridate::ymd() parses them
# even without zero-padded months.
timeseries_plot <- timeseries %>%
mutate(time = sprintf("%s-%s-%s", yr, month, "01"))
# Plot the line over time
ggplot(timeseries_plot, aes(x=lubridate::ymd(time), y=Democratic.Margin)) +
geom_line()
# Make a ggplot with points for monthly polling averages and one trend line running through the entire time series
ggplot(timeseries_plot, aes(x=lubridate::ymd(time), y=Democratic.Margin)) +
geom_point() +
geom_smooth(span=0.2)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Mutate two variables for the Democrats' margin in polls and election day votes
poll_error <- generic_ballot %>%
mutate(Dem.Poll.Margin = Democrats - Republicans,
Dem.Vote.Margin = DemVote - RepVote
)
# Average those two variables per year and mutate the "error" variable
# error > 0 means the polls overstated the Democratic margin that year.
poll_error <- poll_error %>%
group_by(ElecYear) %>%
summarise(Dem.Poll.Margin = mean(Dem.Poll.Margin), Dem.Vote.Margin = mean(Dem.Vote.Margin)) %>%
mutate(error = Dem.Poll.Margin - Dem.Vote.Margin)
# Calculate the root-mean-square error of the error variable
rmse <- sqrt(mean(poll_error$error^2))
# Multiply the RMSE by 1.96 to get the 95% confidence interval, or "margin of error"
CI <- rmse * 1.96
# Add variables to our dataset for the upper and lower bound of the `Dem.Poll.Margin` variable
by_year <- poll_error %>%
mutate(upper = Dem.Poll.Margin + CI, lower = Dem.Poll.Margin - CI)
# Plot estimates for Dem.Poll.Margin and Dem.Vote.Margin on the y axis for each year on the x axis with geom_point
ggplot(by_year) +
geom_point(aes(x=ElecYear, y=Dem.Poll.Margin, col="Poll")) +
geom_point(aes(x=ElecYear, y=Dem.Vote.Margin, col="Vote")) +
geom_errorbar(aes(x=ElecYear, ymin=lower, ymax=upper))
# Fit a model predicting Democratic vote margin with Democratic poll margin
model <- lm(Dem.Vote.Margin ~ Dem.Poll.Margin, data=by_year)
# Evaluate the model
summary(model)
##
## Call:
## lm(formula = Dem.Vote.Margin ~ Dem.Poll.Margin, data = by_year)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.9701 -3.2791 -0.1947 3.0657 10.4314
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.80939 1.19857 -0.675 0.504
## Dem.Poll.Margin 0.52693 0.09582 5.499 3.86e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.758 on 34 degrees of freedom
## Multiple R-squared: 0.4708, Adjusted R-squared: 0.4552
## F-statistic: 30.24 on 1 and 34 DF, p-value: 3.855e-06
# Build a one-row data.frame holding the predictor value we want to score
# (column name must match the model term, Dem.Poll.Margin)
predictdata <- data.frame(Dem.Poll.Margin = 5)
# Score that value with the fitted model's coefficients
predict(model, predictdata)
## 1
## 1.825248
Chapter 3 - Election Results and Political Demography
2016 Presidential Election:
Making County-Level Maps in R:
Analyzing Results with Linear Regression:
2016 Brexit Referendum:
Example code includes:
# Load 2016 US presidential results by county (one row per county x party).
uspres_results <- readr::read_csv("./RInputFiles/us_pres_2016_by_county.csv")
## Parsed with column specification:
## cols(
## county.fips = col_double(),
## county.name = col_character(),
## state.name = col_character(),
## party = col_character(),
## vote.count = col_double(),
## county.total.count = col_double(),
## national.party.percent = col_double(),
## national.count = col_double(),
## is.national.winner = col_logical()
## )
glimpse(uspres_results)
## Observations: 9,297
## Variables: 9
## $ county.fips <dbl> 45001, 45001, 45001, 22001, 22001, 22001, 51...
## $ county.name <chr> "abbeville", "abbeville", "abbeville", "acad...
## $ state.name <chr> "south carolina", "south carolina", "south c...
## $ party <chr> "D", "O", "R", "D", "O", "R", "D", "O", "R",...
## $ vote.count <dbl> 3741, 271, 6763, 5638, 589, 21162, 6740, 495...
## $ county.total.count <dbl> 10775, 10775, 10775, 27389, 27389, 27389, 15...
## $ national.party.percent <dbl> 48.098104, 5.789663, 46.112232, 48.098104, 5...
## $ national.count <dbl> 135851595, 135851595, 135851595, 135851595, ...
## $ is.national.winner <lgl> FALSE, FALSE, TRUE, FALSE, FALSE, TRUE, FALS...
# Deselect the is.national.winner, national.count, and national.party.percent variables
uspres_results.slim <- uspres_results %>%
select(-c(is.national.winner, national.count, national.party.percent))
# Spread party and votes to their own columns
# NOTE(review): tidyr::spread() is superseded by pivot_wider(); kept here to
# match the course exercise and the captured output above.
uspres_county <- uspres_results.slim %>%
tidyr::spread(key=party,value=vote.count)
# Add a variable to the uspres_county dataset to store the Democrat's percentage of votes
# (Dem.pct is a proportion in [0, 1], not a percent 0-100)
uspres_county <- uspres_county %>%
mutate(Dem.pct = D/county.total.count)
# Load the county demographic data
# (choroplethr ships county-level ACS demographics keyed by FIPS in 'region')
data(df_county_demographics, package="choroplethr")
# Look at the demographic data
head(df_county_demographics)
## region total_population percent_white percent_black percent_asian
## 1 1001 54907 76 18 1
## 2 1003 187114 83 9 1
## 3 1005 27321 46 46 0
## 4 1007 22754 75 22 0
## 5 1009 57623 88 1 0
## 6 1011 10746 22 71 0
## percent_hispanic per_capita_income median_rent median_age
## 1 2 24571 668 37.5
## 2 4 26766 693 41.5
## 3 5 16829 382 38.3
## 4 2 17427 351 39.4
## 5 8 20730 403 39.6
## 6 6 18628 276 39.6
# Rename the 'region' variable in df_county_demographics to "county.fips"
# so it matches the key column used in the election-results dataset
df_county_demographics <- df_county_demographics %>%
rename("county.fips" = region)
# Join county demographic with vote share data via its FIPS code
# (left join keeps every demographic row even where vote data is missing)
county_merged <- left_join(df_county_demographics, uspres_county, by = "county.fips")
head(county_merged)
## county.fips total_population percent_white percent_black percent_asian
## 1 1001 54907 76 18 1
## 2 1003 187114 83 9 1
## 3 1005 27321 46 46 0
## 4 1007 22754 75 22 0
## 5 1009 57623 88 1 0
## 6 1011 10746 22 71 0
## percent_hispanic per_capita_income median_rent median_age county.name
## 1 2 24571 668 37.5 autauga
## 2 4 26766 693 41.5 baldwin
## 3 5 16829 382 38.3 barbour
## 4 2 17427 351 39.4 bibb
## 5 8 20730 403 39.6 blount
## 6 6 18628 276 39.6 bullock
## state.name county.total.count D O R Dem.pct
## 1 alabama 24973 5936 865 18172 0.23769671
## 2 alabama 95215 18458 3874 72883 0.19385601
## 3 alabama 10469 4871 144 5454 0.46527844
## 4 alabama 8819 1874 207 6738 0.21249575
## 5 alabama 25588 2156 573 22859 0.08425825
## 6 alabama 4710 3530 40 1140 0.74946921
# plot percent_white and Dem.pct on the x and y axes. add points and a trend line
# (the warnings below come from the 44 counties without matched vote data)
ggplot(county_merged, aes(x=percent_white, y=Dem.pct)) +
geom_point() +
geom_smooth(method="lm")
## Warning: Removed 44 rows containing non-finite values (stat_smooth).
## Warning: Removed 44 rows containing missing values (geom_point).
# Rename the county.fips and Dem.pct variables from our dataset to "region" and "value"
# (county_choropleth() requires exactly these two column names)
county_map <- county_merged %>%
rename("region" = county.fips, "value" = Dem.pct)
# Create the map with choroplethrMaps's county_choropleth()
democratic_map <- choroplethr::county_choropleth(county_map)
## Warning in self$bind(): The following regions were missing and are being set
## to NA: 17049, 17065, 2050, 17079, 2105, 17175, 2122, 17101, 17127, 2150, 17159,
## 17165, 2164, 2180, 2188, 2240, 2090, 2198, 15005, 2100, 2170, 17151, 2016, 2060,
## 2290, 17059, 17067, 2282, 17185, 2070, 2110, 2130, 2185, 2195, 2220, 2230, 2020,
## 2068, 2013, 2261, 2270, 2275, 17003, 17047
# Print the map
democratic_map
# Rename variables from our dataset
county_map <- county_merged %>%
rename("region" = county.fips, "value" = percent_white)
# Create the map with choroplethr's county_choropleth()
white_map <- choroplethr::county_choropleth(county_map)
# Graph the two maps (democratic_map and white_map) from the previous exercises side-by-side
gridExtra::grid.arrange(democratic_map, white_map)
# Fit a linear model to predict Dem.pct dependent on percent_white in each county
fit <- lm(Dem.pct ~ percent_white, data=county_merged)
# Evaluate the model
summary(fit)
##
## Call:
## lm(formula = Dem.pct ~ percent_white, data = county_merged)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.39987 -0.08303 -0.00903 0.07281 0.47761
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.6719046 0.0090408 74.32 <2e-16 ***
## percent_white -0.0045684 0.0001123 -40.68 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1227 on 3097 degrees of freedom
## (44 observations deleted due to missingness)
## Multiple R-squared: 0.3482, Adjusted R-squared: 0.348
## F-statistic: 1655 on 1 and 3097 DF, p-value: < 2.2e-16
# Fit a linear model to predict Dem.pct dependent on percent_white and per_capita_income in each county
fit <- lm(Dem.pct ~ percent_white + per_capita_income, data=county_merged)
# Evaluate the model
summary(fit)
##
## Call:
## lm(formula = Dem.pct ~ percent_white + per_capita_income, data = county_merged)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.48529 -0.06838 0.00214 0.06847 0.39700
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.006e-01 1.094e-02 45.77 <2e-16 ***
## percent_white -5.080e-03 1.053e-04 -48.25 <2e-16 ***
## per_capita_income 8.961e-06 3.727e-07 24.05 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1127 on 3096 degrees of freedom
## (44 observations deleted due to missingness)
## Multiple R-squared: 0.4508, Adjusted R-squared: 0.4505
## F-statistic: 1271 on 2 and 3096 DF, p-value: < 2.2e-16
# Load the Brexit referendum polling data (one row per poll; Date is m/d/y text).
brexit_polls <- readr::read_csv("./RInputFiles/brexit_polls.csv")
## Parsed with column specification:
## cols(
## Date = col_character(),
## Remain = col_double(),
## Leave = col_double()
## )
glimpse(brexit_polls)
## Observations: 35
## Variables: 3
## $ Date <chr> "6/23/16", "6/22/16", "6/22/16", "6/22/16", "6/22/16", "6/22...
## $ Remain <dbl> 52, 55, 51, 49, 44, 54, 48, 41, 45, 42, 53, 45, 44, 44, 42, ...
## $ Leave <dbl> 48, 45, 49, 46, 45, 46, 42, 43, 44, 44, 46, 42, 43, 44, 44, ...
# Load the constituency-level referendum results with 2015 vote shares and demographics.
brexit_results <- readr::read_csv("./RInputFiles/brexit_results.csv")
## Parsed with column specification:
## cols(
## Seat = col_character(),
## con_2015 = col_double(),
## lab_2015 = col_double(),
## ld_2015 = col_double(),
## ukip_2015 = col_double(),
## leave_share = col_double(),
## born_in_uk = col_double(),
## male = col_double(),
## unemployed = col_double(),
## degree = col_double(),
## age_18to24 = col_double()
## )
glimpse(brexit_results)
## Observations: 632
## Variables: 11
## $ Seat <chr> "Aldershot", "Aldridge-Brownhills", "Altrincham and Sal...
## $ con_2015 <dbl> 50.592, 52.050, 52.994, 43.979, 60.788, 22.418, 52.454,...
## $ lab_2015 <dbl> 18.333, 22.369, 26.686, 34.781, 11.197, 41.022, 18.441,...
## $ ld_2015 <dbl> 8.824, 3.367, 8.383, 2.975, 7.192, 14.828, 5.984, 2.423...
## $ ukip_2015 <dbl> 17.867, 19.624, 8.011, 15.887, 14.438, 21.409, 18.821, ...
## $ leave_share <dbl> 57.89777, 67.79635, 38.58780, 65.29912, 49.70111, 70.47...
## $ born_in_uk <dbl> 83.10464, 96.12207, 90.48566, 97.30437, 93.33793, 96.96...
## $ male <dbl> 49.89896, 48.92951, 48.90621, 49.21657, 48.00189, 49.17...
## $ unemployed <dbl> 3.637000, 4.553607, 3.039963, 4.261173, 2.468100, 4.742...
## $ degree <dbl> 13.870661, 9.974114, 28.600135, 9.336294, 18.775591, 6....
## $ age_18to24 <dbl> 9.406093, 7.325850, 6.437453, 7.747801, 5.734730, 8.209...
# Filter the dataset to polls only released after June 16th, 2016, and mutate a variable for the Remain campaign's lead
# (Date text is parsed with lubridate::mdy() so it can be compared as a real Date)
brexit_average <- brexit_polls %>%
filter(lubridate::mdy(Date)>lubridate::ymd("2016-06-16") )%>%
mutate(RemainLead = Remain - Leave)
# Average the last seven days of polling
mean(brexit_average$RemainLead)
## [1] 2.857143
# Summarise the Remain lead from the entire month of the referendum
ggplot(brexit_polls, aes(x=lubridate::mdy(Date), y=Remain-Leave)) +
geom_point() +
geom_smooth(method='loess')
# Familiarize yourself with the data using the head() function
head(brexit_results)
## # A tibble: 6 x 11
## Seat con_2015 lab_2015 ld_2015 ukip_2015 leave_share born_in_uk male
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Alde~ 50.6 18.3 8.82 17.9 57.9 83.1 49.9
## 2 Aldr~ 52.0 22.4 3.37 19.6 67.8 96.1 48.9
## 3 Altr~ 53.0 26.7 8.38 8.01 38.6 90.5 48.9
## 4 Ambe~ 44.0 34.8 2.98 15.9 65.3 97.3 49.2
## 5 Arun~ 60.8 11.2 7.19 14.4 49.7 93.3 48.0
## 6 Ashf~ 22.4 41.0 14.8 21.4 70.5 97.0 49.2
## # ... with 3 more variables: unemployed <dbl>, degree <dbl>, age_18to24 <dbl>
# Chart the constituency-by-constituency relationship between voting for the Labour Party and voting to leave the EU
ggplot(brexit_results,aes(x=lab_2015, y=leave_share)) +
geom_point()
# Show the relationship between UKIP and Leave vote share with points and a line representing the linear relationship between the variables
ggplot(brexit_results,aes(x=ukip_2015, y=leave_share)) +
geom_point() +
geom_smooth(method = "lm")
# predict leave's share with the percentage of a constituency that holds a college degree and its 2015 UKIP vote share
model.multivar <- lm(leave_share ~ ukip_2015 + degree, brexit_results)
summary(model.multivar)
##
## Call:
## lm(formula = leave_share ~ ukip_2015 + degree, data = brexit_results)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.976 -2.076 0.283 2.479 10.280
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 56.52977 1.02503 55.15 <2e-16 ***
## ukip_2015 0.72948 0.04250 17.16 <2e-16 ***
## degree -0.80594 0.02833 -28.45 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.718 on 570 degrees of freedom
## (59 observations deleted due to missingness)
## Multiple R-squared: 0.8821, Adjusted R-squared: 0.8817
## F-statistic: 2133 on 2 and 570 DF, p-value: < 2.2e-16
Chapter 4 - Predicting the Future of Politics
US House 2018:
Training a Model to Predict the Future with Polls:
Presidency in 2020:
Wrap-up:
Example code includes:
# Inline 2018 generic-ballot polling data: one row per poll with the Democratic
# and Republican shares and the poll's end date (most recent first).
polls_2018 <- tibble::tibble(Democrat=c(45, 47, 49, 44, 41, 48, 45, 44, 45, 51, 42, 52, 46, 44, 41, 42, 44, 42, 44, 44, 42, 41, 51, 47, 49, 45, 44, 45, 48, 42, 42, 47, 42, 44, 47, 43, 50, 43, 43, 41, 45, 44, 44, 42, 45, 50, 48, 48, 43, 45, 48, 46, 48, 44, 43, 44, 49, 42, 39, 42, 44, 43, 40, 42, 42, 38, 43, 44, 39, 42, 47, 42, 43, 43, 48, 49, 46, 43, 43, 45, 44, 43, 44, 44, 47, 44, 38, 42, 43, 43, 41, 41, 42, 42, 50, 50, 44, 46, 44, 40, 42, 43, 38, 43, 49, 43, 38, 50, 44, 40, 37, 41, 47, 54, 46, 43, 38, 42, 39, 38, 49, 49, 43, 38, 42, 45, 47, 42, 37, 41, 38, 43, 42, 51, 51, 42, 37, 41, 53, 46, 44, 40, 44, 42, 38, 44, 44, 39, 44, 56, 51, 44, 51, 37, 41, 50, 42, 37, 40, 41, 36, 42, 37, 42, 43, 43, 42, 38, 44, 51, 40, 38, 38, 51, 51, 39, 40, 43, 44, 50, 40, 36, 42, 41, 42, 54, 40, 43, 39, 41, 40, 48, 42, 49, 39, 43, 40, 40, 39, 43, 40, 40, 39, 49, 41, 46, 41, 40, 47, 39, 51, 43, 39, 44, 40, 40, 40, 50, 42, 39, 43, 37, 43, 47, 41, 48, 42, 38, 43, 38, 42, 50, 41, 42, 39, 43, 38, 41, 40, 42, 49, 42, 40, 42, 38, 41, 47, 39, 50, 40, 47, 47, 38, 40, 45, 40, 43, 41, 48, 47, 46, 46, 49, 45, 48),
Republican=c(39, 34, 38, 38, 37, 43, 36, 40, 36, 42, 38, 41, 39, 41, 39, 36, 40, 36, 37, 38, 37, 38, 39, 40, 37, 37, 35, 37, 40, 38, 37, 34, 38, 34, 37, 40, 41, 35, 38, 38, 36, 37, 37, 36, 39, 42, 43, 41, 37, 35, 43, 40, 39, 38, 32, 37, 42, 39, 37, 37, 37, 38, 37, 36, 37, 37, 38, 35, 38, 35, 44, 39, 37, 34, 43, 41, 38, 34, 38, 34, 35, 38, 34, 39, 43, 36, 30, 36, 36, 34, 37, 37, 35, 35, 39, 44, 39, 41, 38, 37, 38, 38, 30, 36, 40, 37, 31, 41, 37, 38, 31, 39, 32, 38, 39, 35, 30, 38, 30, 39, 41, 38, 37, 31, 38, 39, 45, 37, 31, 37, 30, 37, 37, 39, 41, 36, 32, 38, 39, 40, 37, 31, 36, 36, 31, 36, 35, 27, 34, 38, 40, 36, 36, 29, 36, 37, 36, 31, 38, 35, 30, 36, 28, 33, 36, 40, 34, 31, 36, 36, 33, 30, 39, 40, 40, 36, 31, 38, 37, 35, 33, 28, 36, 33, 37, 38, 33, 37, 32, 39, 34, 37, 38, 43, 33, 37, 31, 37, 33, 38, 32, 36, 33, 35, 36, 40, 33, 38, 40, 34, 42, 36, 34, 37, 34, 40, 34, 40, 40, 32, 40, 35, 39, 41, 35, 38, 40, 35, 37, 36, 39, 40, 37, 39, 33, 39, 36, 37, 33, 35, 38, 37, 35, 36, 35, 41, 42, 34, 41, 44, 42, 41, 32, 40, 38, 37, 40, 39, 43, 38, 41, 43, 41, 42, 40),
end_date=as.Date(c('2018-08-28', '2018-08-28', '2018-08-21', '2018-08-21', '2018-08-21', '2018-08-19', '2018-08-18', '2018-08-14', '2018-08-14', '2018-08-13', '2018-08-12', '2018-08-12', '2018-08-12', '2018-08-07', '2018-08-07', '2018-08-06', '2018-07-31', '2018-07-31', '2018-07-30', '2018-07-24', '2018-07-24', '2018-07-23', '2018-07-23', '2018-07-22', '2018-07-22', '2018-07-17', '2018-07-17', '2018-07-14', '2018-07-11', '2018-07-10', '2018-07-10', '2018-07-10', '2018-07-03', '2018-07-02', '2018-07-02', '2018-07-01', '2018-07-01', '2018-06-29', '2018-06-26', '2018-06-26', '2018-06-25', '2018-06-24', '2018-06-19', '2018-06-19', '2018-06-18', '2018-06-17', '2018-06-17', '2018-06-13', '2018-06-12', '2018-06-12', '2018-06-12', '2018-06-10', '2018-06-06', '2018-06-05', '2018-06-05', '2018-06-04', '2018-05-30', '2018-05-29', '2018-05-29', '2018-05-29', '2018-05-22', '2018-05-22', '2018-05-22', '2018-05-19', '2018-05-15', '2018-05-15', '2018-05-14', '2018-05-08', '2018-05-08', '2018-05-07', '2018-05-05', '2018-05-01', '2018-05-01', '2018-05-01', '2018-05-01', '2018-04-30', '2018-04-30', '2018-04-24', '2018-04-24', '2018-04-24', '2018-04-23', '2018-04-17', '2018-04-17', '2018-04-13', '2018-04-11', '2018-04-10', '2018-04-10', '2018-04-07', '2018-04-03', '2018-04-03', '2018-04-01', '2018-03-27', '2018-03-27', '2018-03-27', '2018-03-25', '2018-03-25', '2018-03-21', '2018-03-21', '2018-03-20', '2018-03-20', '2018-03-19', '2018-03-13', '2018-03-13', '2018-03-12', '2018-03-08', '2018-03-06', '2018-03-06', '2018-03-05', '2018-03-05', '2018-02-27', '2018-02-27', '2018-02-26', '2018-02-24', '2018-02-23', '2018-02-21', '2018-02-20', '2018-02-20', '2018-02-13', '2018-02-13', '2018-02-12', '2018-02-11', '2018-02-07', '2018-02-06', '2018-02-06', '2018-02-04', '2018-02-01', '2018-01-30', '2018-01-30', '2018-01-30', '2018-01-23', '2018-01-23', '2018-01-21', '2018-01-20', '2018-01-18', '2018-01-18', '2018-01-16', '2018-01-16', '2018-01-16', '2018-01-15', '2018-01-10', '2018-01-09', 
'2018-01-09', '2018-01-05', '2018-01-02', '2018-01-02', '2017-12-26', '2017-12-19', '2017-12-19', '2017-12-18', '2017-12-17', '2017-12-12', '2017-12-12', '2017-12-12', '2017-12-12', '2017-12-11', '2017-12-07', '2017-12-05', '2017-12-05', '2017-12-03', '2017-11-28', '2017-11-28', '2017-11-25', '2017-11-22', '2017-11-21', '2017-11-19', '2017-11-15', '2017-11-14', '2017-11-14', '2017-11-11', '2017-11-09', '2017-11-07', '2017-11-07', '2017-11-06', '2017-11-05', '2017-11-01', '2017-10-31', '2017-10-31', '2017-10-30', '2017-10-30', '2017-10-24', '2017-10-24', '2017-10-24', '2017-10-23', '2017-10-16', '2017-10-16', '2017-10-15', '2017-10-10', '2017-10-09', '2017-10-03', '2017-10-01', '2017-09-26', '2017-09-25', '2017-09-24', '2017-09-20', '2017-09-19', '2017-09-17', '2017-09-12', '2017-09-11', '2017-09-05', '2017-09-03', '2017-08-29', '2017-08-28', '2017-08-22', '2017-08-21', '2017-08-19', '2017-08-17', '2017-08-15', '2017-08-14', '2017-08-12', '2017-08-08', '2017-08-06', '2017-08-06', '2017-08-01', '2017-07-29', '2017-07-25', '2017-07-24', '2017-07-18', '2017-07-17', '2017-07-15', '2017-07-11', '2017-07-09', '2017-07-04', '2017-06-30', '2017-06-27', '2017-06-27', '2017-06-25', '2017-06-24', '2017-06-20', '2017-06-19', '2017-06-13', '2017-06-12', '2017-06-11', '2017-06-06', '2017-06-02', '2017-05-30', '2017-05-30', '2017-05-23', '2017-05-22', '2017-05-16', '2017-05-14', '2017-05-14', '2017-05-11', '2017-05-09', '2017-05-06', '2017-05-02', '2017-04-30', '2017-04-25', '2017-04-25', '2017-04-25', '2017-04-24', '2017-04-20', '2017-04-18', '2017-04-18', '2017-04-15', '2017-04-12', '2017-04-11', '2017-04-09', '2017-04-01', '2017-03-28', '2017-03-27', '2017-03-12', '2017-02-22', '2017-02-08', '2017-01-31', '2017-01-24'))
)
# average all of the generic ballot polls that have been taken since June
# (month > 6 restricts to July and August; year filter keeps only 2018 polls)
polls_2018 %>%
filter(lubridate::month(end_date) > 6, lubridate::year(end_date)==2018) %>%
mutate(Dem.Margin = Democrat - Republican) %>%
pull(Dem.Margin) %>%
mean()
## [1] 7.081081
# Filter the dataset to include polls from August and September
# Mutate a variable for the Democratic vote margin in that year
# NOTE(review): generic_ballot is assumed to be loaded earlier in the document
# (it is not defined in this section).
polls_predict <- generic_ballot %>%
filter(lubridate::month(lubridate::mdy(Date)) %in% c(8, 9), ElecYear >= 1980) %>%
mutate(Dem.Poll.Margin = Democrats - Republicans,
Dem.Vote.Margin = DemVote - RepVote
) %>%
group_by(ElecYear) %>%
summarise(Dem.Poll.Margin = mean(Dem.Poll.Margin),
Dem.Vote.Margin = mean(Dem.Vote.Margin)
) %>%
arrange(ElecYear) %>%
mutate(error=Dem.Poll.Margin - Dem.Vote.Margin,
party_in_power=c(-1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1)
)
# Fit a model to predict Democrats' November vote margin with the Democratic poll margin and party in power variable
model <- lm(Dem.Vote.Margin ~ Dem.Poll.Margin + party_in_power, data=polls_predict)
# Evaluate the model
summary(model)
##
## Call:
## lm(formula = Dem.Vote.Margin ~ Dem.Poll.Margin + party_in_power,
## data = polls_predict)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.3893 -2.4283 -0.2004 2.4982 4.6166
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.1168 1.1244 -1.883 0.078079 .
## Dem.Poll.Margin 0.8856 0.2070 4.278 0.000577 ***
## party_in_power -2.1348 0.8809 -2.423 0.027601 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.238 on 16 degrees of freedom
## Multiple R-squared: 0.7498, Adjusted R-squared: 0.7185
## F-statistic: 23.98 on 2 and 16 DF, p-value: 1.535e-05
# Make a prediction for November if Democrats are up 7.5 points in the generic ballot and the party_in_power is the Republicans (-1)
predict(model, data.frame(Dem.Poll.Margin = 7.5, party_in_power=-1))
## 1
## 6.660162
# Multiply the root-mean-square error by 1.96 (approximate 95% interval half-width)
sqrt(mean(c(model$fitted.values - polls_predict$Dem.Vote.Margin)^2)) * 1.96
## [1] 5.823251
# Inline presidential-election data, 1948-2016: Q2 GDP growth, net presidential
# approval, whether the incumbent party has held office two-plus terms, and the
# incumbent party's two-party vote share.
pres_elecs <- tibble::tibble(Year=c(2016, 2012, 2008, 2004, 2000, 1996, 1992, 1988, 1984, 1980, 1976, 1972, 1968, 1964, 1960, 1956, 1952, 1948),
q2_gdp=c(2.3, 1.3, 1.3, 2.6, 8, 7.1, 4.3, 5.2, 7.1, -7.9, 3, 9.8, 7, 4.7, -1.9, 3.2, 0.4, 7.5),
pres_approve=c(7, -0.8, -37, -0.5, 19.5, 15.5, -18, 10, 20, -21.7, 5, 26, -5, 60.3, 37, 53.5, -27, -6),
two_plus_terms=c(1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1),
vote_share=c(51.1, 51.96, 46.3, 51.2, 50.3, 54.7, 46.5, 53.9, 59.2, 44.7, 48.9, 61.8, 49.6, 61.3, 49.9, 57.8, 44.5, 52.4)
)
pres_elecs
## # A tibble: 18 x 5
## Year q2_gdp pres_approve two_plus_terms vote_share
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2016 2.3 7 1 51.1
## 2 2012 1.3 -0.8 0 52.0
## 3 2008 1.3 -37 1 46.3
## 4 2004 2.6 -0.5 0 51.2
## 5 2000 8 19.5 1 50.3
## 6 1996 7.1 15.5 0 54.7
## 7 1992 4.3 -18 1 46.5
## 8 1988 5.2 10 1 53.9
## 9 1984 7.1 20 0 59.2
## 10 1980 -7.9 -21.7 0 44.7
## 11 1976 3 5 1 48.9
## 12 1972 9.8 26 0 61.8
## 13 1968 7 -5 1 49.6
## 14 1964 4.7 60.3 0 61.3
## 15 1960 -1.9 37 1 49.9
## 16 1956 3.2 53.5 0 57.8
## 17 1952 0.4 -27 1 44.5
## 18 1948 7.5 -6 1 52.4
# Make a plot with points representing a year's presidential approval and vote share and a line running through them to show the linear relationship
ggplot(pres_elecs, aes(x=pres_approve, y=vote_share, label=Year)) +
geom_text() +
geom_smooth(method='lm')
# Make a model that predicts the vote_share variable with pres_approve, q2_gdp, and two_plus_terms
fit <- lm(vote_share ~ pres_approve + q2_gdp + two_plus_terms, pres_elecs)
# Evaluate the model
summary(fit)
##
## Call:
## lm(formula = vote_share ~ pres_approve + q2_gdp + two_plus_terms,
## data = pres_elecs)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.5897 -1.1466 -0.1435 1.6203 2.5569
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 51.43054 0.80876 63.592 < 2e-16 ***
## pres_approve 0.10132 0.02048 4.947 0.000215 ***
## q2_gdp 0.56573 0.11697 4.837 0.000264 ***
## two_plus_terms -4.04250 0.99566 -4.060 0.001170 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.894 on 14 degrees of freedom
## Multiple R-squared: 0.8944, Adjusted R-squared: 0.8718
## F-statistic: 39.53 on 3 and 14 DF, p-value: 4.379e-07
# Save the predicted vote shares to a variable called predict
pres_elecs$predict <- predict(fit, pres_elecs)
# Graph the predictions and vote shares with a label for each election year
# (geom_abline() draws the y = x line: points on it are perfectly predicted)
ggplot(pres_elecs,aes(x=predict, y=vote_share, label=Year)) +
geom_abline() +
geom_text()
# Calculate the model's root-mean-square error, scaled by 1.96 for a ~95% interval
sqrt(mean(c(pres_elecs$predict - pres_elecs$vote_share)^2)) * 1.96
## [1] 3.273301
# Make a prediction for hypothetical data
predict(fit, data.frame(pres_approve=-15, q2_gdp=2, two_plus_terms=0))
## 1
## 51.04215
Chapter 1 - Census Data in R with tidycensus
Overview:
Basic tidycensus functionality:
Searching for data with tidycensus:
Visualizing census data with ggplot2:
Example code includes:
# Load the tidycensus package into your R session
library(tidycensus)
# Define your Census API key and set it with census_api_key()
# NOTE: never commit a real API key to source control; "INSERT HERE" is a placeholder.
api_key <- "INSERT HERE"
census_api_key(api_key)
# Check your API key
Sys.getenv("CENSUS_API_KEY")
# Obtain and view state populations from the 2010 US Census
state_pop <- get_decennial(geography = "state", variables = "P001001")
head(state_pop)
# Obtain and view state median household income from the 2012-2016 American Community Survey
state_income <- get_acs(geography = "state", variables = "B19013_001")
head(state_income)
# Get an ACS dataset for Census tracts in Texas by setting the state
tx_income <- get_acs(geography = "tract",
variables = "B19013_001",
state = "TX")
# Inspect the dataset
head(tx_income)
# Get an ACS dataset for Census tracts in Travis County, TX
travis_income <- get_acs(geography = "tract",
variables = "B19013_001",
state = "TX",
county = "Travis")
# Inspect the dataset
head(travis_income)
# Supply custom variable names
# (a named vector maps the friendly name "hhincome" onto the Census variable code)
travis_income2 <- get_acs(geography = "tract",
variables = c(hhincome = "B19013_001"),
state = "TX",
county = "Travis")
# Inspect the dataset
head(travis_income2)
# Return county data in wide format
# (wide output appends E for estimate and M for margin of error to each name)
or_wide <- get_acs(geography = "county", state = "OR",
variables = c(hhincome = "B19013_001", medage = "B01002_001"),
output = "wide"
)
# Compare output to the tidy format from previous exercises
head(or_wide)
# Create a scatterplot
plot(or_wide$hhincomeE, or_wide$medageE)
# Load variables from the 2012-2016 ACS
v16 <- load_variables(year = 2016, dataset = "acs5", cache = TRUE)
# Get variables from the ACS Data Profile
v16p <- load_variables(year = 2016, dataset = "acs5/profile", cache = TRUE)
# Set year and dataset to get variables from the 2000 Census SF3
v00 <- load_variables(year = 2000, dataset = "sf3", cache = TRUE)
# Filter for table B19001
filter(v16, str_detect(name, "B19001"))
# Use public transportation to search for related variables
filter(v16p, str_detect(label, fixed("public transportation", ignore_case = TRUE)))
# Access the 1-year ACS with the survey parameter
ne_income <- get_acs(geography = "state",
variables = "B19013_001",
survey = "acs1",
state = c("ME", "NH", "VT", "MA",
"RI", "CT", "NY"))
# Create a dot plot
ggplot(ne_income, aes(x = estimate, y = NAME)) +
geom_point()
# Reorder the states in descending order of estimates
ggplot(ne_income, aes(x = estimate, y = reorder(NAME, estimate))) +
geom_point()
# Set dot color and size
g_color <- ggplot(ne_income, aes(x = estimate, y = reorder(NAME, estimate))) +
geom_point(color = "navy", size = 4)
# Format the x-axis labels
g_scale <- g_color +
scale_x_continuous(labels = scales::dollar) +
theme_minimal(base_size = 18)
# Label your x-axis, y-axis, and title your chart
g_label <- g_scale +
labs(x = "2016 ACS estimate", y = "", title = "Median household income by state")
g_label
Chapter 2 - Wrangling US Census Data
Tables and summary variables in tidycensus:
Census data wrangling with tidy tools:
Working with margins of error in tidycensus:
Visualizing margins of error from ACS:
Example code includes:
library(tidycensus)
# Download table "B19001"
wa_income <- get_acs(geography = "county",
state = "WA",
table = "B19001")
# Check out the first few rows of wa_income
head(wa_income)
# Assign Census variables vector to race_vars
race_vars <- c(White = "B03002_003", Black = "B03002_004", Native = "B03002_005",
Asian = "B03002_006", HIPI = "B03002_007", Hispanic = "B03002_012"
)
# Request a summary variable from the ACS
# (summary_var attaches the table total so group shares can be computed)
ca_race <- get_acs(geography = "county",
state = "CA",
variables = race_vars,
summary_var = "B03002_001")
# Calculate a new percentage column and check the result
ca_race_pct <- ca_race %>%
mutate(pct = 100 * (estimate / summary_est))
head(ca_race_pct)
# Group the dataset and filter the estimate
# (keeps, per county, the row with the largest group estimate)
ca_largest <- ca_race %>%
group_by(GEOID) %>%
filter(estimate == max(estimate))
head(ca_largest)
# Group the dataset and get a breakdown of the results
ca_largest %>%
group_by(variable) %>%
tally()
# Use a tidy workflow to wrangle ACS data
# NOTE(review): the case_when() compares variable codes as strings; this works
# because Census codes like "B19001_008" sort lexicographically in numeric order.
wa_grouped <- wa_income %>%
filter(variable != "B19001_001") %>%
mutate(incgroup = case_when(
variable < "B19001_008" ~ "below35k",
variable < "B19001_013" ~ "35kto75k",
TRUE ~ "above75k"
)) %>%
group_by(NAME, incgroup) %>%
summarize(group_est = sum(estimate))
wa_grouped
# Map through ACS1 estimates to see how they change through the years
mi_cities <- map_df(2012:2016, function(x) {
get_acs(geography = "place",
variables = c(totalpop = "B01003_001"),
state = "MI",
survey = "acs1",
year = x) %>%
mutate(year = x)
})
mi_cities %>% arrange(NAME, year)
# Get data on elderly poverty by Census tract in Vermont
vt_eldpov <- get_acs(geography = "tract",
variables = c(eldpovm = "B17001_016",
eldpovf = "B17001_030"),
state = "VT")
vt_eldpov
# Identify rows with greater margins of error than their estimates
moe_check <- filter(vt_eldpov, moe > estimate)
# Check proportion of rows where the margin of error exceeds the estimate
nrow(moe_check) / nrow(vt_eldpov)
# Calculate a margin of error for a sum
moe_sum(moe = c(55, 33, 44, 12, 4))
# Calculate a margin of error for a product
moe_product(est1 = 55, est2 = 33, moe1 = 12, moe2 = 9)
# Calculate a margin of error for a ratio
moe_ratio(num = 1000, denom = 950, moe_num = 200, moe_denom = 177)
# Calculate a margin of error for a proportion
moe_prop(num = 374, denom = 1200, moe_num = 122, moe_denom = 333)
# Group the dataset and calculate a derived margin of error
vt_eldpov2 <- vt_eldpov %>%
group_by(GEOID) %>%
summarize(
estmf = sum(estimate),
moemf = moe_sum(moe = moe, estimate = estimate)
)
# Filter rows where newly-derived margin of error exceeds newly-derived estimate
moe_check2 <- filter(vt_eldpov2, moemf > estmf)
# Check proportion of rows where margin of error exceeds estimate
nrow(moe_check2) / nrow(vt_eldpov2)
# Request median household income data
maine_inc <- get_acs(geography = "county",
variables = c(hhincome = "B19013_001"),
state = "ME")
# Generate horizontal error bars with dots
ggplot(maine_inc, aes(x = estimate, y = NAME)) +
geom_errorbarh(aes(xmin = estimate - moe, xmax = estimate + moe)) +
geom_point()
# Remove unnecessary content from the county's name
maine_inc2 <- maine_inc %>%
mutate(NAME = str_replace(NAME, " County, Maine", ""))
# Build a margin of error plot incorporating your modifications
ggplot(maine_inc2, aes(x = estimate, y = reorder(NAME, estimate))) +
geom_errorbarh(aes(xmin = estimate - moe, xmax = estimate + moe)) +
geom_point(size = 3, color = "darkgreen") +
theme_grey(base_size = 14) +
labs(title = "Median household income",
subtitle = "Counties in Maine",
x = "ACS estimate (bars represent margins of error)",
y = "") +
scale_x_continuous(labels = scales::dollar)
Chapter 3 - US Census Geographic Data in R
Understanding census geography and tigris basics:
Customizing tigris options:
Combining and joining census geographic districts:
Plotting data with tigris and ggplot2:
Example code includes:
library(tigris)
# Get a counties dataset for Colorado and plot it
co_counties <- counties(state = "CO")
plot(co_counties)
# Get a Census tracts dataset for Denver County, Colorado and plot it
denver_tracts <- tracts(state = "CO", county = "Denver")
plot(denver_tracts)
# Plot area water features for Lane County, Oregon
lane_water <- area_water(state = "OR", county = "Lane")
plot(lane_water)
# Plot primary & secondary roads for the state of New Hampshire
nh_roads <- primary_secondary_roads(state = "NH")
plot(nh_roads)
# Check the class of the data
# (before the sf option is set below, tigris returns sp Spatial* objects,
# hence the @data and @proj4string slot access)
class(co_counties)
# Take a look at the information in the data slot
head(co_counties@data)
# Check the coordinate system of the data
co_counties@proj4string
# Get a counties dataset for Michigan
mi_tiger <- counties("MI")
# Get the equivalent cartographic boundary shapefile
# (cb = TRUE returns generalized boundaries clipped to the shoreline)
mi_cb <- counties("MI", cb = TRUE)
# Overlay the two on a plot to make a comparison
plot(mi_tiger)
plot(mi_cb, add = TRUE, border = "red")
# Get data from tigris as simple features
options(tigris_class = "sf")
# Get counties from Colorado and view the first few rows
colorado_sf <- counties("CO")
head(colorado_sf)
# Plot its geometry column
plot(colorado_sf$geometry)
# DO NOT ADD CACHE FOR NOW
# Set the cache directory
# tigris_cache_dir("Your preferred cache directory path would go here")
# Set the tigris_use_cache option
# options(tigris_use_cache = TRUE)
# Check to see that you've modified the option correctly
# getOption("tigris_use_cache")
# Get a historic Census tract shapefile from 1990 for Williamson County, Texas
williamson90 <- tracts(state = "TX", county = "Williamson",
cb = TRUE, year = 1990)
# Compare with a current dataset for 2016
williamson16 <- tracts(state = "TX", county = "Williamson",
cb = TRUE, year = 2016)
# Plot the geometry to compare the results
par(mfrow = c(1, 2))
plot(williamson90$geometry)
plot(williamson16$geometry)
# Get Census tract boundaries for Oregon and Washington
or_tracts <- tracts("OR", cb = TRUE)
wa_tracts <- tracts("WA", cb = TRUE)
# Check the tigris attributes of each object
# (rbind_tigris() requires matching tigris attributes)
attr(or_tracts, "tigris")
attr(wa_tracts, "tigris")
# Combine the datasets then plot the result
or_wa_tracts <- rbind_tigris(or_tracts, wa_tracts)
plot(or_wa_tracts$geometry)
# Generate a vector of state codes and assign to new_england
new_england <- c("ME", "NH", "VT", "MA")
# Iterate through the states and request tract data for state
ne_tracts <- map(new_england, function(x) {
tracts(state = x, cb = TRUE)
}) %>%
rbind_tigris()
plot(ne_tracts$geometry)
# Get boundaries for Texas and set the house parameter
tx_house <- state_legislative_districts(state = "TX", house = "lower", cb = TRUE)
# Merge data on legislators to their corresponding boundaries
# NOTE(review): tx_members is assumed to be loaded earlier in the document.
tx_joined <- left_join(tx_house, tx_members, by = c("NAME" = "District"))
head(tx_joined)
# Plot the legislative district boundaries
ggplot(tx_joined) +
geom_sf()
# Set fill aesthetic to map areas represented by Republicans and Democrats
ggplot(tx_joined, aes(fill = Party)) +
geom_sf()
# Set values so that Republican areas are red and Democratic areas are blue
ggplot(tx_joined, aes(fill = Party)) +
geom_sf() +
scale_fill_manual(values = c("R" = "red", "D" = "blue"))
# Draw a ggplot without gridlines and with an informative title
# (datum = NA suppresses graticule lines; crs 3083 is an equal-area projection for Texas)
ggplot(tx_joined, aes(fill = Party)) +
geom_sf() +
coord_sf(crs = 3083, datum = NA) +
scale_fill_manual(values = c("R" = "red", "D" = "blue")) +
theme_minimal(base_size = 16) +
labs(title = "State House Districts in Texas")
Chapter 4 - Mapping US Census Data
Simple feature geometry and tidycensus:
Mapping demographic data with ggplot2:
Advanced demographic mapping:
Cartographic workflows with tigris and tidycensus:
Next steps for working with demographic data in R:
Example code includes:
library(sf)
# Get dataset with geometry set to TRUE
orange_value <- get_acs(geography = "tract", state = "CA", county = "Orange",
variables = "B25077_001", geometry = TRUE
)
# Plot the estimate to view a map of the data
plot(orange_value["estimate"])
# Get an income dataset for Idaho by school district
idaho_income <- get_acs(geography = "school district (unified)",
variables = "B19013_001",
state = "ID")
# Get a school district dataset for Idaho
idaho_school <- school_districts(state = "ID", type = "unified", class = "sf")
# Join the income dataset to the boundaries dataset
id_school_joined <- left_join(idaho_school, idaho_income, by = "GEOID")
plot(id_school_joined["estimate"])
# Get a dataset of median home values from the 1-year ACS
state_value <- get_acs(geography = "state",
variables = "B25077_001",
survey = "acs1",
geometry = TRUE,
shift_geo = TRUE)
# Plot the dataset to view the shifted geometry
plot(state_value["estimate"])
# Create a choropleth map with ggplot
ggplot(marin_value, aes(fill = estimate)) +
geom_sf()
# Set continuous viridis palettes for your map
ggplot(marin_value, aes(fill = estimate, color = estimate)) +
geom_sf() +
scale_fill_viridis_c() +
scale_color_viridis_c()
# Set the color guide to FALSE and add a subtitle and caption to your map
ggplot(marin_value, aes(fill = estimate, color = estimate)) +
geom_sf() +
scale_fill_viridis_c(labels = scales::dollar) +
scale_color_viridis_c(guide = FALSE) +
theme_minimal() +
coord_sf(crs = 26911, datum = NA) +
labs(title = "Median owner-occupied housing value by Census tract",
subtitle = "Marin County, California",
caption = "Data source: 2012-2016 ACS.\nData acquired with the R tidycensus package.",
fill = "ACS estimate")
# Generate point centers
centers <- st_centroid(state_value)
# Set size parameter and the size range
ggplot() +
geom_sf(data = state_value, fill = "white") +
geom_sf(data = centers, aes(size = estimate), shape = 21,
fill = "lightblue", alpha = 0.7, show.legend = "point") +
scale_size_continuous(range = c(1, 20))
# Check the first few rows of the loaded dataset dc_race
head(dc_race)
# Remove the gridlines and generate faceted maps
ggplot(dc_race, aes(fill = percent, color = percent)) +
geom_sf() +
coord_sf(datum = NA) +
facet_wrap(~variable)
# Map the orange_value dataset interactively
m <- mapview(orange_value)
m@map
# Map your data by the estimate column
m <- mapview(orange_value, zcol = "estimate")
m@map
# Add a legend to your map
m <- mapview(orange_value, zcol = "estimate", legend=TRUE)
m@map
# Generate dots, create a group column, and group by group column
# (dot-density map: one random point per ~100 people of each race group)
dc_dots <- map(c("White", "Black", "Hispanic", "Asian"), function(group) {
dc_race %>%
filter(variable == group) %>%
# st_sample() draws value / 100 random points inside each tract polygon
st_sample(., size = .$value / 100) %>%
st_sf() %>%
mutate(group = group)
}) %>%
# Stack the four sf layers, then collapse to one (multi)point feature per group
reduce(rbind) %>%
group_by(group) %>%
summarize()
# Filter the DC roads object for major roads only
# (RTTYP codes: I = Interstate, S = State road, U = U.S. highway — TODO confirm
# against the tigris/TIGER RTTYP code table)
dc_roads <- roads("DC", "District of Columbia") %>%
filter(RTTYP %in% c("I", "S", "U"))
# Get an area water dataset for DC
dc_water <- area_water("DC", "District of Columbia")
# Get the boundary of DC
dc_boundary <- counties("DC", cb = TRUE)
# Plot your datasets and give your map an informative caption
# Layer order matters: boundary first, then dots, water, roads on top
ggplot() +
geom_sf(data = dc_boundary, color = NA, fill = "white") +
geom_sf(data = dc_dots, aes(color = group, fill = group), size = 0.1) +
geom_sf(data = dc_water, color = "lightblue", fill = "lightblue") +
geom_sf(data = dc_roads, color = "grey") +
# Project to CRS 26918; datum = NA suppresses the graticule gridlines
coord_sf(crs = 26918, datum = NA) +
scale_color_brewer(palette = "Set1", guide = FALSE) +
scale_fill_brewer(palette = "Set1") +
labs(title = "The racial geography of Washington, DC",
subtitle = "2010 decennial U.S. Census",
fill = "",
caption = "1 dot = approximately 100 people.\nData acquired with the R tidycensus and tigris packages.")
Chapter 1 - Reading and Plotting Multivariate Data
Reading multivariate data:
Mean vector and variance-covariance matrix:
Plotting multivariate data:
Example code includes:
# Read in the wine dataset
wine <- read.table("http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data", sep = ",")
# Print the first four entries
head(wine, n=4)
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14
## 1 1 14.23 1.71 2.43 15.6 127 2.80 3.06 0.28 2.29 5.64 1.04 3.92 1065
## 2 1 13.20 1.78 2.14 11.2 100 2.65 2.76 0.26 1.28 4.38 1.05 3.40 1050
## 3 1 13.16 2.36 2.67 18.6 101 2.80 3.24 0.30 2.81 5.68 1.03 3.17 1185
## 4 1 14.37 1.95 2.50 16.8 113 3.85 3.49 0.24 2.18 7.80 0.86 3.45 1480
# Find the dimensions of the data
dim(wine)
## [1] 178 14
# Check the names of the wine dataset
names(wine)
## [1] "V1" "V2" "V3" "V4" "V5" "V6" "V7" "V8" "V9" "V10" "V11"
## [12] "V12" "V13" "V14"
# Assign new names
names(wine) <- c('Type', 'Alcohol', 'Malic', 'Ash', 'Alcalinity', 'Magnesium', 'Phenols', 'Flavanoids', 'Nonflavanoids','Proanthocyanins', 'Color', 'Hue', 'Dilution', 'Proline')
# Check the new column names
names(wine)
## [1] "Type" "Alcohol" "Malic"
## [4] "Ash" "Alcalinity" "Magnesium"
## [7] "Phenols" "Flavanoids" "Nonflavanoids"
## [10] "Proanthocyanins" "Color" "Hue"
## [13] "Dilution" "Proline"
# Check data type of each variable
str(wine)
## 'data.frame': 178 obs. of 14 variables:
## $ Type : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Alcohol : num 14.2 13.2 13.2 14.4 13.2 ...
## $ Malic : num 1.71 1.78 2.36 1.95 2.59 1.76 1.87 2.15 1.64 1.35 ...
## $ Ash : num 2.43 2.14 2.67 2.5 2.87 2.45 2.45 2.61 2.17 2.27 ...
## $ Alcalinity : num 15.6 11.2 18.6 16.8 21 15.2 14.6 17.6 14 16 ...
## $ Magnesium : int 127 100 101 113 118 112 96 121 97 98 ...
## $ Phenols : num 2.8 2.65 2.8 3.85 2.8 3.27 2.5 2.6 2.8 2.98 ...
## $ Flavanoids : num 3.06 2.76 3.24 3.49 2.69 3.39 2.52 2.51 2.98 3.15 ...
## $ Nonflavanoids : num 0.28 0.26 0.3 0.24 0.39 0.34 0.3 0.31 0.29 0.22 ...
## $ Proanthocyanins: num 2.29 1.28 2.81 2.18 1.82 1.97 1.98 1.25 1.98 1.85 ...
## $ Color : num 5.64 4.38 5.68 7.8 4.32 6.75 5.25 5.05 5.2 7.22 ...
## $ Hue : num 1.04 1.05 1.03 0.86 1.04 1.05 1.02 1.06 1.08 1.01 ...
## $ Dilution : num 3.92 3.4 3.17 3.45 2.93 2.85 3.58 3.58 2.85 3.55 ...
## $ Proline : int 1065 1050 1185 1480 735 1450 1290 1295 1045 1045 ...
# Change the Type variable data type
wine$Type <- as.factor(wine$Type)
# Check data type again
str(wine)
## 'data.frame': 178 obs. of 14 variables:
## $ Type : Factor w/ 3 levels "1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
## $ Alcohol : num 14.2 13.2 13.2 14.4 13.2 ...
## $ Malic : num 1.71 1.78 2.36 1.95 2.59 1.76 1.87 2.15 1.64 1.35 ...
## $ Ash : num 2.43 2.14 2.67 2.5 2.87 2.45 2.45 2.61 2.17 2.27 ...
## $ Alcalinity : num 15.6 11.2 18.6 16.8 21 15.2 14.6 17.6 14 16 ...
## $ Magnesium : int 127 100 101 113 118 112 96 121 97 98 ...
## $ Phenols : num 2.8 2.65 2.8 3.85 2.8 3.27 2.5 2.6 2.8 2.98 ...
## $ Flavanoids : num 3.06 2.76 3.24 3.49 2.69 3.39 2.52 2.51 2.98 3.15 ...
## $ Nonflavanoids : num 0.28 0.26 0.3 0.24 0.39 0.34 0.3 0.31 0.29 0.22 ...
## $ Proanthocyanins: num 2.29 1.28 2.81 2.18 1.82 1.97 1.98 1.25 1.98 1.85 ...
## $ Color : num 5.64 4.38 5.68 7.8 4.32 6.75 5.25 5.05 5.2 7.22 ...
## $ Hue : num 1.04 1.05 1.03 0.86 1.04 1.05 1.02 1.06 1.08 1.01 ...
## $ Dilution : num 3.92 3.4 3.17 3.45 2.93 2.85 3.58 3.58 2.85 3.55 ...
## $ Proline : int 1065 1050 1185 1480 735 1450 1290 1295 1045 1045 ...
# Calculate the mean of the Alcohol, Malic, Ash, and Alcalinity variables
colMeans(wine[, 2:5])
## Alcohol Malic Ash Alcalinity
## 13.000618 2.336348 2.366517 19.494944
# Calculate the mean of the variables by wine type
by(wine[, 2:5], wine$Type, FUN=colMeans)
## wine$Type: 1
## Alcohol Malic Ash Alcalinity
## 13.744746 2.010678 2.455593 17.037288
## --------------------------------------------------------
## wine$Type: 2
## Alcohol Malic Ash Alcalinity
## 12.278732 1.932676 2.244789 20.238028
## --------------------------------------------------------
## wine$Type: 3
## Alcohol Malic Ash Alcalinity
## 13.153750 3.333750 2.437083 21.416667
# Calculate the variance-covariance matrix of the variables Alcohol, Malic, Ash, Alcalinity
var.wine <- var(wine[, 2:5])
# Round the matrix values to two decimal places
round(var.wine, 2)
## Alcohol Malic Ash Alcalinity
## Alcohol 0.66 0.09 0.05 -0.84
## Malic 0.09 1.25 0.05 1.08
## Ash 0.05 0.05 0.08 0.41
## Alcalinity -0.84 1.08 0.41 11.15
# Calculate the correlation matrix (cor(), not the covariance matrix)
cor.wine <- cor(wine[, 2:5])
# Round the correlation matrix to two decimal places for display
round(cor.wine, 2)
## Alcohol Malic Ash Alcalinity
## Alcohol 1.00 0.09 0.21 -0.31
## Malic 0.09 1.00 0.16 0.29
## Ash 0.21 0.16 1.00 0.44
## Alcalinity -0.31 0.29 0.44 1.00
# Plot the correlations
corrplot::corrplot(cor.wine, method = "ellipse")
# Scatter plot matrix with base R
pairs(wine[, 2:5])
# Scatter plot matrix with lattice
lattice::splom(~wine[, 2:5])
# Scatter plot matrix colored by groups
lattice::splom( ~ wine[2:5], pch = 16, col=wine$Type)
# Produce a matrix of plots for the first four variables
wine.gg <- GGally::ggpairs(wine[, 2:5])
wine.gg
# Produce a matrix of plots for the first four variables
wine.gg <- GGally::ggpairs(wine, columns=2:5)
wine.gg
# Color the points by wine type
wine.gg <- GGally::ggpairs(data = wine, columns = 2:5, aes(color=Type))
wine.gg
# Plot the three variables
scatterplot3d::scatterplot3d(wine[, c("Alcohol", "Malic", "Alcalinity")], color=wine$Type)
Chapter 2 - Multivariate Normal Distribution
Multivariate Normal Distribution:
Density of a multivariate normal distribution:
Cumulative distribution and inverse CDF:
Checking normality of multivariate data:
Example code includes:
mu.sim <- c(2, -2)
sigma.sim <- matrix(data=c(9, 5, 5, 4), nrow=2, byrow=FALSE)
mu.sim
## [1] 2 -2
sigma.sim
## [,1] [,2]
## [1,] 9 5
## [2,] 5 4
# Generate 100 bivariate normal samples
multnorm.sample <- mvtnorm::rmvnorm(100, mean=mu.sim, sigma=sigma.sim)
# View the first 6 samples
head(multnorm.sample)
## [,1] [,2]
## [1,] -1.5479933 -3.650487
## [2,] 0.2534189 -2.858655
## [3,] 1.8644954 -2.708265
## [4,] 0.2055723 -2.143091
## [5,] 0.3575927 -4.418678
## [6,] 1.0176200 -3.387277
# Scatterplot of the bivariate samples
plot(multnorm.sample)
# Calculate density
multnorm.dens <- mvtnorm::dmvnorm(multnorm.sample, mean = mu.sim, sigma = sigma.sim)
# Create scatter plot of density heights
scatterplot3d::scatterplot3d(cbind(multnorm.sample, multnorm.dens), color="blue", pch="",
type = "h", xlab = "x", ylab = "y", zlab = "density"
)
mvals <- expand.grid(seq(-5, 10, length.out = 40), seq(-8, 4, length.out = 40))
str(mvals)
## 'data.frame': 1600 obs. of 2 variables:
## $ Var1: num -5 -4.62 -4.23 -3.85 -3.46 ...
## $ Var2: num -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 ...
## - attr(*, "out.attrs")=List of 2
## ..$ dim : int 40 40
## ..$ dimnames:List of 2
## .. ..$ Var1: chr "Var1=-5.0000000" "Var1=-4.6153846" "Var1=-4.2307692" "Var1=-3.8461538" ...
## .. ..$ Var2: chr "Var2=-8.0000000" "Var2=-7.6923077" "Var2=-7.3846154" "Var2=-7.0769231" ...
# Calculate density over the specified grid
mvds <- mvtnorm::dmvnorm(mvals, mean=mu.sim, sigma=sigma.sim)
matrix_mvds <- matrix(mvds, nrow = 40)
# Create a perspective plot
persp(matrix_mvds, theta = 80, phi = 30, expand = 0.6, shade = 0.2,
col = "lightblue", xlab = "x", ylab = "y", zlab = "dens"
)
# Volume under a bivariate standard normal
mvtnorm::pmvnorm(lower = c(-1, -1), upper = c(1, 1))
## [1] 0.4660649
## attr(,"error")
## [1] 1e-15
## attr(,"msg")
## [1] "Normal Completion"
# Volume under specified mean and variance-covariance matrix
mvtnorm::pmvnorm(lower = c(-5, -5), upper = c(5, 5), mean = mu.sim, sigma = sigma.sim)
## [1] 0.7734162
## attr(,"error")
## [1] 1e-15
## attr(,"msg")
## [1] "Normal Completion"
# Probability contours for a standard bivariate normal
mvtnorm::qmvnorm(p=0.9, tail = "both", sigma = diag(2))
## $quantile
## [1] 1.948779
##
## $f.quantile
## [1] -1.537507e-06
##
## attr(,"message")
## [1] "Normal Completion"
# Probability contours for a bivariate normal
mvtnorm::qmvnorm(p=0.95, tail = "both", mean=mu.sim, sigma=sigma.sim)
## $quantile
## [1] 7.110635
##
## $f.quantile
## [1] 5.712626e-06
##
## attr(,"message")
## [1] "Normal Completion"
# Test sample normality
qqnorm(multnorm.sample[, 1])
qqline(multnorm.sample[, 1])
# requires RJAGS 4+
# Create qqnorm plot (no longer exported from MVN)
# MVN::uniPlot(wine[, c("Alcohol", "Malic", "Ash", "Alcalinity")], type = "qqplot")
# MVN::mvn(wine[, c("Alcohol", "Malic", "Ash", "Alcalinity")], univariatePlot = "qq")
# requires RJAGS 4+
# mardiaTest qqplot
# wine.mvntest <- MVN::mardiaTest(wine[, 2:5]) # 'MVN::mardiaTest' is deprecated.\nUse 'mvn' instead.\nSee help(\"Deprecated\")
# wine.mvntest <- MVN::mvn(wine[, 2:5])
# wine.mvntest
# requires RJAGS 4+
# Use mardiaTest
# MVN::mvn(multnorm.sample)
# requires RJAGS 4+
# Use hzTest
# MVN::hzTest(wine[, 2:5]) # 'MVN::hzTest' is deprecated. Use 'mvn' instead.
# MVN::mvn(wine[, 2:5], mvnTest="hz")
Chapter 3 - Other Multivariate Distributions
Other common multivariate distributions:
Density and cumulative density for multivariate t:
Multivariate skewed distributions:
Example code includes:
# Generate the t-samples
multt.sample <- mvtnorm::rmvt(200, delta=mu.sim, sigma=sigma.sim, df=5)
# Print the first 6 samples
head(multt.sample)
## [,1] [,2]
## [1,] 1.5698586720 -2.3958159
## [2,] -7.8935083134 -8.5106908
## [3,] -0.0009421534 -4.4741769
## [4,] 6.8646754087 -0.4493624
## [5,] 3.1696218853 -0.9010101
## [6,] 1.5311387470 -2.6552099
# Requires RJAGS 4+
# Check multivariate normality
# MVN::mvn(multt.sample, univariatePlot="qq", mvnTest="mardia")
# Calculate densities
multt.dens <- mvtnorm::dmvt(multt.sample, delta=mu.sim, sigma=sigma.sim, df=5)
# Plot 3D heights of densities
scatterplot3d::scatterplot3d(cbind(multt.sample, multt.dens), color = "blue", pch = "",
type = "h", xlab = "x", ylab = "y", zlab = "density"
)
# Calculate the volume under the specified t-distribution
mvtnorm::pmvt(lower = c(-5, -5), upper = c(5, 5), delta=mu.sim, sigma=sigma.sim, df=5)
## [1] 0.6627531
## attr(,"error")
## [1] 0.0006930966
## attr(,"msg")
## [1] "Normal Completion"
# Calculate the equal probability contour
mvtnorm::qmvt(p=0.9, tail="both", delta=0, sigma=diag(2), df=5)
## $quantile
## [1] 2.490408
##
## $f.quantile
## [1] -2.150358e-07
##
## attr(,"message")
## [1] "Normal Completion"
# Generate the skew-normal samples
skewnorm.sample <- sn::rmsn(n=100, xi=mu.sim, Omega=sigma.sim, alpha=c(4, -4))
# Print first six samples
head(skewnorm.sample)
## [,1] [,2]
## [1,] 5.378809 -0.1682481
## [2,] 2.294349 -3.7606670
## [3,] 1.898728 -3.7117472
## [4,] 3.753504 -2.2154888
## [5,] 8.020180 0.6697977
## [6,] 2.902386 -1.8605283
# Generate the skew-t samples
skewt.sample <- sn::rmst(n = 100, xi = mu.sim, Omega = sigma.sim, alpha = c(4, -4), nu=5)
# Print first six samples
head(skewt.sample)
## [,1] [,2]
## [1,] 2.7731981 -1.7126980
## [2,] 2.8562516 -2.2158996
## [3,] 6.8955406 0.8405366
## [4,] -4.0210404 -6.0080851
## [5,] -1.9499787 -4.7405617
## [6,] -0.1967472 -3.3460686
skewnorm.sampleDF <- data.frame(x=skewnorm.sample[, 1], y=skewnorm.sample[, 2])
str(skewnorm.sampleDF)
## 'data.frame': 100 obs. of 2 variables:
## $ x: num 5.38 2.29 1.9 3.75 8.02 ...
## $ y: num -0.168 -3.761 -3.712 -2.215 0.67 ...
# Contour plot for skew-normal sample
ggplot(skewnorm.sampleDF, aes(x=x, y=y)) +
geom_point() +
geom_density_2d()
# Requires RJAGS 4+
# Normality test for skew-normal sample
# skewnorm.Test <- MVN::mvn(skewnorm.sample, mvnTest="mardia", univariatePlot="qq")
# Requires RJAGS 4+
# Normality test for skew-t sample
# skewt.Test <- MVN::mvn(skewt.sample, mvnTest="mardia", univariatePlot="qq")
ais.female <- data.frame(Ht=c(195.9, 189.7, 177.8, 185, 184.6, 174, 186.2, 173.8, 171.4, 179.9, 193.4, 188.7, 169.1, 177.9, 177.5, 179.6, 181.3, 179.7, 185.2, 177.3, 179.3, 175.3, 174, 183.3, 184.7, 180.2, 180.2, 176, 156, 179.7, 180.9, 179.5, 178.9, 182.1, 186.3, 176.8, 172.6, 176, 169.9, 183, 178.2, 177.3, 174.1, 173.6, 173.7, 178.7, 183.3, 174.4, 173.3, 168.6, 174, 176, 172.2, 182.7, 180.5, 179.8, 179.6, 171.7, 170, 170, 180.5, 173.3, 173.5, 181, 175, 170.3, 165, 169.8, 174.1, 175, 171.1, 172.7, 175.6, 171.6, 172.3, 171.4, 178, 162, 167.3, 162, 170.8, 163, 166.1, 176, 163.9, 173, 177, 168, 172, 167.9, 177.5, 162.5, 172.5, 166.7, 175, 157.9, 158.9, 156.9, 148.9, 149),
Wt=c(78.9, 74.4, 69.1, 74.9, 64.6, 63.7, 75.2, 62.3, 66.5, 62.9, 96.3, 75.5, 63, 80.5, 71.3, 70.5, 73.2, 68.7, 80.5, 72.9, 74.5, 75.4, 69.5, 66.4, 79.7, 73.6, 78.7, 75, 49.8, 67.2, 66, 74.3, 78.1, 79.5, 78.5, 59.9, 63, 66.3, 60.7, 72.9, 67.9, 67.5, 74.1, 68.2, 68.8, 75.3, 67.4, 70, 74, 51.9, 74.1, 74.3, 77.8, 66.9, 83.8, 82.9, 64.1, 68.85, 64.8, 59, 72.1, 75.6, 71.4, 69.7, 63.9, 55.1, 60, 58, 64.7, 87.5, 78.9, 83.9, 82.8, 74.4, 94.8, 49.2, 61.9, 53.6, 63.7, 52.8, 65.2, 50.9, 57.3, 60, 60.1, 52.5, 59.7, 57.3, 59.6, 71.5, 69.7, 56.1, 61.1, 47.4, 56, 45.8, 47.8, 43.8, 37.8, 45.1)
)
str(ais.female)
## 'data.frame': 100 obs. of 2 variables:
## $ Ht: num 196 190 178 185 185 ...
## $ Wt: num 78.9 74.4 69.1 74.9 64.6 63.7 75.2 62.3 66.5 62.9 ...
# Fit skew-normal parameters
fit.ais <- sn::msn.mle(y = cbind(ais.female$Ht, ais.female$Wt), opt.method = "BFGS")
# Print the skewness parameters
fit.ais$dp$alpha
## [1] -1.292446 -1.000158
# Fit skew-normal parameters
fit.ais <- sn::msn.mle(y = ais.female[, c("Ht", "Wt")])
# Print the skewness parameters
fit.ais$dp$alpha
## Ht Wt
## -1.292450 -1.000187
Chapter 4 - Principal Component Analysis and Multidimensional Scaling
Principal Component Analysis:
Choosing the number of components:
Interpreting PCA attributes:
Multi-dimensional scaling:
Wrap-Up:
Example code includes:
data(state)
str(state.x77)
## num [1:50, 1:8] 3615 365 2212 2110 21198 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:50] "Alabama" "Alaska" "Arizona" "Arkansas" ...
## ..$ : chr [1:8] "Population" "Income" "Illiteracy" "Life Exp" ...
par(mfrow=c(1, 1))
par(mfcol=c(1, 1))
# Calculate PCs
pca.state <- princomp(state.x77, cor=TRUE, scores=TRUE)
# Plot the PCA object
plot(pca.state)
# Print the summary of the PCs
summary(pca.state)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## Standard deviation 1.8970755 1.2774659 1.0544862 0.84113269 0.62019488
## Proportion of Variance 0.4498619 0.2039899 0.1389926 0.08843803 0.04808021
## Cumulative Proportion 0.4498619 0.6538519 0.7928445 0.88128252 0.92936273
## Comp.6 Comp.7 Comp.8
## Standard deviation 0.55449226 0.3800642 0.33643379
## Proportion of Variance 0.03843271 0.0180561 0.01414846
## Cumulative Proportion 0.96779544 0.9858515 1.00000000
# Variance explained by each PC
pca.var <- pca.state$sdev^2
# Proportion of variance explained by each PC
pca.pvar <- pca.var/sum(pca.var)
# Proportion of variance explained by each principal component
pca.pvar
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7
## 0.44986195 0.20398990 0.13899264 0.08843803 0.04808021 0.03843271 0.01805610
## Comp.8
## 0.01414846
# Cumulative variance explained plot
plot(cumsum(pca.pvar), xlab = "Principal component",
ylab = "Cumulative Proportion of variance explained", ylim = c(0,1), type = 'b')
grid()
# Add a horizontal line
abline(h=0.95, col="blue")
# Draw screeplot
screeplot(pca.state, type = "l")
grid()
# Create dataframe of scores
scores.state <- data.frame(pca.state$scores)
# Plot of scores labeled by state name
ggplot(data = scores.state, aes(x = Comp.1, y = Comp.2, label = rownames(scores.state))) +
geom_text( alpha = 0.8, size = 3) +
ggtitle("PCA of states data")
# Create dataframe of scores
scores.state <- data.frame(pca.state$scores)
# Plot of scores colored by region
ggplot(data=scores.state, aes(x=Comp.1, y=Comp.2, label=rownames(scores.state), color=state.region)) +
geom_text(alpha = 0.8, size = 3) +
ggtitle("PCA of states data colored by region")
# Plot the scores
factoextra::fviz_pca_ind(pca.state)
# Plot the PC loadings
factoextra::fviz_pca_var(pca.state)
# Create a biplot
factoextra::fviz_pca_biplot(pca.state)
# Calculate distance
state.dist <- dist(state.x77)
# Perform multidimensional scaling
mds.state <- cmdscale(state.dist)
mds.state_df <- data.frame(mds.state)
# Plot the representation of the data in two dimensions
ggplot(data = mds.state_df, aes(x = X1, y = X2, label = rownames(mds.state), color = state.region)) +
geom_text(alpha = 0.8, size = 3)
# Calculate distance
wine.dist <- dist(wine[, -1])
# Perform multidimensional scaling
mds.wine <- cmdscale(wine.dist, k=3)
mds.wine_df <- data.frame(mds.wine)
# Plot the representation of the data in three dimensions
scatterplot3d::scatterplot3d(mds.wine_df, color = wine$Type, pch = 19, type = "h", lty.hplot = 2)
Chapter 1 - Programming with purrr
Refresher of purrr Basics:
Introduction to mappers:
Using Mappers to Clean Data:
Predicates:
Example code includes:
visit_a <- c(117, 147, 131, 73, 81, 134, 121)
visit_b <- c(180, 193, 116, 166, 131, 153, 146)
visit_c <- c(57, 110, 68, 72, 87, 141, 67)
# to_day(): convert an hourly visit count (vector-safe) to a daily total.
# Returns the input multiplied by 24.
to_day <- function(x) x * 24
# Create a list containing both vectors: all_visits
all_visits <- list(visit_a, visit_b)
# Convert to daily number of visits: all_visits_day
all_visits_day <- map(all_visits, to_day)
# Map the mean() function and output a numeric vector
map_dbl(all_visits_day, mean)
## [1] 2756.571 3720.000
# You'll test out both map() and walk() for plotting
# Both return the "side effects," that is to say, the changes in the environment (drawing plots, downloading a file, changing the working directory...), but walk() won't print anything to the console.
# Create all_tests list and modify with to_day() function
all_tests <- list(visit_a, visit_b, visit_c)
all_tests_day <- map(all_tests, to_day)
# Plot all_tests_day with map
map(all_tests_day, barplot)
## [[1]]
## [,1]
## [1,] 0.7
## [2,] 1.9
## [3,] 3.1
## [4,] 4.3
## [5,] 5.5
## [6,] 6.7
## [7,] 7.9
##
## [[2]]
## [,1]
## [1,] 0.7
## [2,] 1.9
## [3,] 3.1
## [4,] 4.3
## [5,] 5.5
## [6,] 6.7
## [7,] 7.9
##
## [[3]]
## [,1]
## [1,] 0.7
## [2,] 1.9
## [3,] 3.1
## [4,] 4.3
## [5,] 5.5
## [6,] 6.7
## [7,] 7.9
# Plot all_tests_day
walk(all_tests_day, barplot)
# Get sum of all visits and class of sum_all
sum_all <- pmap(all_tests_day, sum)
class(sum_all)
## [1] "list"
# Turn visit_a into daily number using an anonymous function
map(visit_a, function(x) { x*24 })
## [[1]]
## [1] 2808
##
## [[2]]
## [1] 3528
##
## [[3]]
## [1] 3144
##
## [[4]]
## [1] 1752
##
## [[5]]
## [1] 1944
##
## [[6]]
## [1] 3216
##
## [[7]]
## [1] 2904
# Turn visit_a into daily number of visits by using a mapper
map(visit_a, ~.x*24)
## [[1]]
## [1] 2808
##
## [[2]]
## [1] 3528
##
## [[3]]
## [1] 3144
##
## [[4]]
## [1] 1752
##
## [[5]]
## [1] 1944
##
## [[6]]
## [1] 3216
##
## [[7]]
## [1] 2904
# Create a mapper object called to_day
# (purrr formula mapper: .x is the element, multiplied by 24)
to_day <- as_mapper(~.x*24)
# Use it on the three vectors
map(visit_a, to_day)
## [[1]]
## [1] 2808
##
## [[2]]
## [1] 3528
##
## [[3]]
## [1] 3144
##
## [[4]]
## [1] 1752
##
## [[5]]
## [1] 1944
##
## [[6]]
## [1] 3216
##
## [[7]]
## [1] 2904
map(visit_b, to_day)
## [[1]]
## [1] 4320
##
## [[2]]
## [1] 4632
##
## [[3]]
## [1] 2784
##
## [[4]]
## [1] 3984
##
## [[5]]
## [1] 3144
##
## [[6]]
## [1] 3672
##
## [[7]]
## [1] 3504
map(visit_c, to_day)
## [[1]]
## [1] 1368
##
## [[2]]
## [1] 2640
##
## [[3]]
## [1] 1632
##
## [[4]]
## [1] 1728
##
## [[5]]
## [1] 2088
##
## [[6]]
## [1] 3384
##
## [[7]]
## [1] 1608
# Round visit_a to the nearest ten with a mapper (round(.x, -1) rounds to tens)
map_dbl(visit_a, ~ round(.x, -1))
## [1] 120 150 130 70 80 130 120
# Create to_ten, a mapper that rounds to the nearest ten
to_ten <- as_mapper(~ round(.x, -1))
# Map to_ten on visit_b
map_dbl(visit_b, to_ten)
## [1] 180 190 120 170 130 150 150
# Map to_ten on visit_c
map_dbl(visit_c, to_ten)
## [1] 60 110 70 70 90 140 70
# Create a mapper that tests if .x is more than 100
is_more_than_hundred <- as_mapper(~ .x > 100)
# Run this mapper on the all_visits object
map(all_visits, ~ keep(.x, is_more_than_hundred) )
## [[1]]
## [1] 117 147 131 134 121
##
## [[2]]
## [1] 180 193 116 166 131 153 146
# Use the day vector to set names to all_list
day <- c("mon", "tue", "wed", "thu", "fri", "sat", "sun")
full_visits_named <- map(all_visits, ~ set_names(.x, day))
# Use this mapper with keep()
map(full_visits_named, ~ keep(.x, is_more_than_hundred))
## [[1]]
## mon tue wed sat sun
## 117 147 131 134 121
##
## [[2]]
## mon tue wed thu fri sat sun
## 180 193 116 166 131 153 146
# Set the name of each subvector
day <- c("mon", "tue", "wed", "thu", "fri", "sat", "sun")
all_visits_named <- map(all_visits, ~ set_names(.x, day))
# Create a mapper that will test if .x is over 100
threshold <- as_mapper(~.x > 100)
# Run this mapper on the all_visits object: group_over
group_over <- map(all_visits, ~ keep(.x, threshold) )
# Run this mapper on the all_visits object: group_under
group_under <- map(all_visits, ~ discard(.x, threshold) )
# Create a threshold variable, set it to 160
threshold <- 160
# Create a mapper that tests if .x is over the defined threshold
over_threshold <- as_mapper(~ .x > threshold)
# Are all elements in every all_visits vectors over the defined threshold?
map(all_visits, ~ every(.x, over_threshold))
## [[1]]
## [1] FALSE
##
## [[2]]
## [1] FALSE
# Are some elements in every all_visits vectors over the defined threshold?
map(all_visits, ~ some(.x, over_threshold))
## [[1]]
## [1] FALSE
##
## [[2]]
## [1] TRUE
Chapter 2 - Functional Programming from Theory to Practice
Functional Programming in R:
Tools for Functional Programming in purrr:
function(...){ fun(..., na.rm = TRUE) } Using possibly():
Handling adverb results:
Example code includes:
# `$` is a function call, of a special type called 'infix operator', as they are put between two elements, and can be used without parenthesis.
# Launch Sys.time(), Sys.sleep(1), & Sys.time()
Sys.time()
## [1] "2019-03-05 08:26:47 CST"
Sys.sleep(1)
Sys.time()
## [1] "2019-03-05 08:26:48 CST"
data(iris)
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
# Launch nrow(iris), Sys.sleep(1), & nrow(iris)
nrow(iris)
## [1] 150
Sys.sleep(1)
nrow(iris)
## [1] 150
# Launch ls(), create an object, then rerun the ls() function
# ls()
# this <- 12
# ls()
# Create a plot of the iris dataset
plot(iris)
urls <- c('https://thinkr.fr', 'https://colinfay.me', 'http://not_working.org', 'https://datacamp.com', 'http://cran.r-project.org/', 'https://not_working_either.org')
# Create a safe version of read_lines()
safe_read <- safely(readr::read_lines)
# Map it on the urls vector
res <- map(urls, safe_read)
# Set the name of the results to `urls`
named_res <- set_names(res, urls)
# Extract only the "error" part of each sublist
map(named_res, "error")
## $`https://thinkr.fr`
## NULL
##
## $`https://colinfay.me`
## NULL
##
## $`http://not_working.org`
## <simpleError in open.connection(con, "rb"): Could not resolve host: not_working.org>
##
## $`https://datacamp.com`
## NULL
##
## $`http://cran.r-project.org/`
## NULL
##
## $`https://not_working_either.org`
## <simpleError in open.connection(con, "rb"): Could not resolve host: not_working_either.org>
# Read a URL with the safely()-wrapped reader and drop the NULL slot,
# keeping only the populated element ($result on success, $error on failure).
safe_read_discard <- function(url) {
  wrapped <- safe_read(url)
  discard(wrapped, is.null)
}
# Map this function on the url list
res <- map(urls, safe_read_discard)
# Create a possibly() version of read_lines()
possible_read <- possibly(readr::read_lines, otherwise = 404)
# Map this function on urls, pipe it into set_names()
res <- map(urls, possible_read) %>% set_names(urls)
# Paste each element of the list
res_pasted <- map(res, paste, collapse=" ")
# Keep only the elements which are equal to 404
keep(res_pasted, ~ .x == 404)
## $`http://not_working.org`
## [1] "404"
##
## $`https://not_working_either.org`
## [1] "404"
# Return the URLs from url_list that could be read successfully.
# Unreachable URLs are mapped to 404 via possibly(), then discarded.
url_tester <- function(url_list){
  url_list %>%
    # Map a version of read_lines() that otherwise returns 404
    map( possibly(readr::read_lines, otherwise = 404) ) %>%
    # Name results by the function's own input
    # (bug fix: previously used the global `urls`, which broke for any
    # url_list of a different length or order)
    set_names( url_list ) %>%
    # paste() and collapse each element (a failed read becomes the string "404")
    map(paste, collapse =" ") %>%
    # Remove the 404s ("404" == 404 is TRUE via coercion)
    discard(~.x==404) %>%
    names() # Will return the names of the good ones
}
# Try this function on the urls object
url_tester(urls)
## [1] "https://thinkr.fr" "https://colinfay.me"
## [3] "https://datacamp.com" "http://cran.r-project.org/"
# Read each URL with a safely()-wrapped reader and return either the list of
# $result components or the list of $error components, named by URL.
# type: "result" (default) or "error".
url_tester <- function(url_list, type = c("result", "error")){
  # Bug fix: with the length-2 default, `if (type == "result")` was a
  # length > 1 condition (an error since R 4.2). match.arg() selects the
  # first choice by default and validates explicit values.
  type <- match.arg(type)
  res <- url_list %>%
    # Create a safely() version of read_lines()
    map( safely(readr::read_lines) ) %>%
    set_names( url_list ) %>%
    # Transpose into a list of $result and $error
    transpose()
  if (type == "result") return( res$result )
  res$error
}
# Try this function on the urls object
url_tester(urls, type = "error")
## $`https://thinkr.fr`
## NULL
##
## $`https://colinfay.me`
## NULL
##
## $`http://not_working.org`
## <simpleError in open.connection(con, "rb"): Could not resolve host: not_working.org>
##
## $`https://datacamp.com`
## NULL
##
## $`http://cran.r-project.org/`
## NULL
##
## $`https://not_working_either.org`
## <simpleError in open.connection(con, "rb"): Could not resolve host: not_working_either.org>
# Fetch each URL with a NULL-on-failure GET(), drop the failures, and return
# the HTTP status code of each reachable URL, named by URL.
url_tester <- function(url_list){
  url_list %>%
    # Map a version of GET() that would otherwise return NULL
    map( possibly(httr::GET, otherwise=NULL) ) %>%
    # Name results by the function's own input
    # (bug fix: previously used the global `urls` instead of url_list)
    set_names( url_list ) %>%
    # Remove the NULL entries (unreachable URLs)
    compact() %>%
    # Extract all the "status_code" elements
    map("status_code")
}
# Try this function on the urls object
url_tester(urls)
## $`https://thinkr.fr`
## [1] 200
##
## $`https://colinfay.me`
## [1] 200
##
## $`https://datacamp.com`
## [1] 200
##
## $`http://cran.r-project.org/`
## [1] 200
Chapter 3 - Better Code with purrr
Rationale for cleaner code:
Building functions with compose() and negate():
Prefilling functions:
List columns:
Example code includes:
urls <- c('https://thinkr.fr', 'https://colinfay.me', 'http://not_working.org', 'https://datacamp.com', 'http://cran.r-project.org/', 'https://not_working_either.org')
# Compose a status extractor (compose is also an igraph function)
status_extract <- purrr::compose(httr::status_code, httr::GET)
# Try with "https://thinkr.fr" & "http://datacamp.com"
status_extract("https://thinkr.fr")
## [1] 200
status_extract("http://datacamp.com")
## [1] 200
# Map it on the urls vector, return a vector of numbers
oldUrls <- urls
urls <- oldUrls[c(1, 2, 4, 5)]
map_dbl(urls, status_extract)
## [1] 200 200 200 200
# Negate the %in% function
`%not_in%` <- negate(`%in%`)
# Compose a status extractor
status_extract <- purrr::compose(httr::status_code, httr::GET)
# strict_code(): fetch the HTTP status of `url` via status_extract();
# return the code only when it is a success code (200-203), otherwise NA.
strict_code <- function(url){
  status <- status_extract(url)
  if (status %not_in% 200:203) {
    return(NA)
  }
  status
}
# Map the strict_code function on the urls vector
res <- map(urls, strict_code)
# Set the names of the results using the urls vector
res_named <- set_names(res, urls)
# Negate the is.na function
is_not_na <- negate(is.na)
# Run is_not_na on the results
is_not_na(res_named)
## https://thinkr.fr https://colinfay.me
## TRUE TRUE
## https://datacamp.com http://cran.r-project.org/
## TRUE TRUE
# Prefill html_nodes() with the css param set to h2
get_h2 <- partial(rvest::html_nodes, css="h2")
# Combine the html_text, get_h2 and read_html functions
get_content <- purrr::compose(rvest::html_text, get_h2, xml2::read_html)
# Map get_content to the urls list
res <- map(urls, get_content) %>%
set_names(urls)
# Print the results to the console
res
## $`https://thinkr.fr`
## [1] "Conseil, développement et formation au logiciel R"
## [2] "Formez-vous au logiciel R !"
## [3] "\r\n\t\tPédagogie de la formation au langage R\r\n\t"
## [4] "\r\n\t\tRetour sur les projets R des étudiants du MSc X-HEC Data Science for Business\r\n\t"
## [5] "\r\n\t\tConstruisons la certification R du RConsortium\r\n\t"
## [6] "\r\n\t\tLes tests statistiques\r\n\t"
## [7] "\r\n\t\tÀ la découverte de RStudio Package Manager\r\n\t"
## [8] "\r\n\t\tLes pièges de la représentation de données\r\n\t"
## [9] "\r\n\t\tDBI : Distributeur des Brasseurs Indépendants ? Non DataBase Interface\r\n\t"
## [10] "\r\n\t\tQuoi de neuf {ggplot2} ?\r\n\t"
## [11] "\r\n\t\tComparaison entre Excel et R : Analyses statistiques et graphiques\r\n\t"
##
## $`https://colinfay.me`
## [1] "\n \n Watch if R is running from Shiny\n\n \n "
## [2] "\n \n An Introduction to Docker for R Users\n\n \n "
## [3] "\n \n 2018 through {cranlogs}\n\n \n "
## [4] "\n \n Solving #AdventOfCode day 5 and 6 with R\n\n \n "
## [5] "\n \n Solving #AdventOfCode day 3 and 4 with R\n\n \n "
##
## $`https://datacamp.com`
## character(0)
##
## $`http://cran.r-project.org/`
## character(0)
# Create a partial version of html_nodes(), with the css param set to "a"
a_node <- partial(rvest::html_nodes, css="a")
# Create href(), a partial version of html_attr()
href <- partial(rvest::html_attr, name = "href")
# Combine href(), a_node(), and read_html()
get_links <- purrr::compose(href, a_node, xml2::read_html)
# Map get_links() to the urls list
res <- map(urls, get_links) %>%
set_names(urls)
df <- tibble::tibble(urls=urls)
df
## # A tibble: 4 x 1
## urls
## <chr>
## 1 https://thinkr.fr
## 2 https://colinfay.me
## 3 https://datacamp.com
## 4 http://cran.r-project.org/
# Create a "links" columns, by mapping get_links() on urls
df2 <- df %>%
mutate(links = map(urls, get_links))
# Print df2 to see what it looks like
df2
## # A tibble: 4 x 2
## urls links
## <chr> <list>
## 1 https://thinkr.fr <chr [147]>
## 2 https://colinfay.me <chr [33]>
## 3 https://datacamp.com <chr [92]>
## 4 http://cran.r-project.org/ <chr [1]>
# unnest() df2 to have a tidy dataframe
df2 %>%
tidyr::unnest()
## # A tibble: 273 x 2
## urls links
## <chr> <chr>
## 1 https://thinkr~ https://thinkr.fr/
## 2 https://thinkr~ https://thinkr.fr/
## 3 https://thinkr~ https://thinkr.fr/formation-au-logiciel-r/
## 4 https://thinkr~ https://thinkr.fr/formation-au-logiciel-r/introduction~
## 5 https://thinkr~ https://thinkr.fr/formation-au-logiciel-r/statistique-~
## 6 https://thinkr~ https://thinkr.fr/formation-au-logiciel-r/programmatio~
## 7 https://thinkr~ https://thinkr.fr/formation-au-logiciel-r/r-et-le-big-~
## 8 https://thinkr~ https://thinkr.fr/formation-au-logiciel-r/r-pour-la-fi~
## 9 https://thinkr~ https://thinkr.fr/formation-au-logiciel-r/integration-~
## 10 https://thinkr~ https://thinkr.fr/formation-au-logiciel-r/formation-si~
## # ... with 263 more rows
Chapter 4 - Case Study
Discovering the Dataset:
Extracting Information from the Dataset:
Manipulating URL:
Identifying Influencers:
Wrap up:
Example code includes:
rstudioconfDF <- readRDS("./RInputFiles/#RStudioConf.RDS")
dim(rstudioconfDF)
## [1] 5055 42
rstudioconf <- as.list(as.data.frame(t(rstudioconfDF)))
length(rstudioconf)
## [1] 5055
length(rstudioconf[[1]])
## [1] 42
# Print the first element of the list to the console
rstudioconf[[1]]
## $status_id
## [1] "960732355773239296"
##
## $created_at
## [1] 1517891417
##
## $user_id
## [1] "626266741"
##
## $screen_name
## [1] "grod_rf"
##
## $text
## [1] "RT @dataandme: <U+0001F41C> Check it, @ajmcoqui's \"Debugging in RStudio\" \n<U+0001F4FD> Slides *and* cheat sheet!\nhttps://t.co/rAvKP9iXLa #rstats #rstudioconf htt…"
##
## $source
## [1] "Twitter for Android"
##
## $reply_to_status_id
## [1] NA
##
## $reply_to_user_id
## [1] NA
##
## $reply_to_screen_name
## [1] NA
##
## $is_quote
## [1] FALSE
##
## $is_retweet
## [1] TRUE
##
## $favorite_count
## [1] 0
##
## $retweet_count
## [1] 7
##
## $hashtags
## [1] "rstats" "rstudioconf"
##
## $symbols
## [1] NA
##
## $urls_url
## [1] "buff.ly/2s7W8ED"
##
## $urls_t.co
## [1] "https://t.co/rAvKP9iXLa"
##
## $urls_expanded_url
## [1] "https://buff.ly/2s7W8ED"
##
## $media_url
## [1] NA
##
## $media_t.co
## [1] NA
##
## $media_expanded_url
## [1] NA
##
## $media_type
## [1] NA
##
## $ext_media_url
## [1] NA
##
## $ext_media_t.co
## [1] NA
##
## $ext_media_expanded_url
## [1] NA
##
## $ext_media_type
## [1] NA
##
## $mentions_user_id
## [1] "3230388598" "732925397814247426"
##
## $mentions_screen_name
## [1] "dataandme" "ajmcoqui"
##
## $lang
## [1] "en"
##
## $quoted_status_id
## [1] NA
##
## $quoted_text
## [1] NA
##
## $retweet_status_id
## [1] "960600422556880896"
##
## $retweet_text
## [1] "<U+0001F41C> Check it, @ajmcoqui's \"Debugging in RStudio\" \n<U+0001F4FD> Slides *and* cheat sheet!\nhttps://t.co/rAvKP9iXLa #rstats #rstudioconf https://t.co/T4627GcuXK"
##
## $place_url
## [1] NA
##
## $place_name
## [1] NA
##
## $place_full_name
## [1] NA
##
## $place_type
## [1] NA
##
## $country
## [1] NA
##
## $country_code
## [1] NA
##
## $geo_coords
## [1] NA NA
##
## $coords_coords
## [1] NA NA
##
## $bbox_coords
## [1] NA NA NA NA NA NA NA NA
# Create a sublist of non-retweets
non_rt <- discard(rstudioconf, "is_retweet")
# Extract the favorite count element of each non_rt sublist
fav_count <- map_dbl(non_rt, "favorite_count")
# Get the median of favorite_count for non_rt
median(fav_count)
## [1] 1
# Keep the RT, extract the user_id, remove the duplicate
rt <- keep(rstudioconf, "is_retweet") %>%
map("user_id") %>%
unique()
# Remove the RT, extract the user id, remove the duplicate
non_rt <- discard(rstudioconf, "is_retweet") %>%
map("user_id") %>%
unique()
# Determine the total number of users
union(rt, non_rt) %>% length()
## [1] 1742
# Determine the number of users who has just retweeted
setdiff(rt, non_rt) %>% length()
## [1] 1302
# Pre-fill mean() with na.rm = TRUE and round() with digits = 1
mean_na_rm <- partial(mean, na.rm = TRUE)
round_one <- partial(round, digits = 1)
# rounded_mean(x): mean of x ignoring NAs, rounded to one decimal place.
# Written as an explicit wrapper instead of purrr::compose(); the call
# sequence (mean first, then round) is identical.
rounded_mean <- function(x) round_one(mean_na_rm(x))
# Extract the non retweet
non_rt <- discard(rstudioconf, "is_retweet")
# Extract "favorite_count", and pass it to rounded_mean()
map_dbl(non_rt, "favorite_count") %>%
rounded_mean()
## [1] 3.3
# Combine as_vector(), compact(), and flatten()
# flatten_to_vector(): applied right-to-left -- flatten one list level,
# drop NULL entries, then coerce the remainder to an atomic vector.
flatten_to_vector <- purrr::compose(as_vector, compact, flatten)
# Complete the function
# extractor(): pull the `what` element out of every sublist of `list`
# and return all the values as a single atomic vector.
extractor <- function(list, what = "mentions_screen_name"){
  pulled <- map(list, what)
  flatten_to_vector(pulled)
}
# Create six_most, with tail(), sort(), and table():
# tabulate the values, sort the counts ascending, and keep the last six
# entries (tail() defaults to n = 6), i.e. the six most frequent values.
six_most <- function(x) tail(sort(table(x)))
# Run extractor() on rstudioconf
extractor(rstudioconf) %>%
six_most()
## .
## JennyBryan hadleywickham AmeliaMN juliasilge drob
## 278 308 362 376 418
## rstudio
## 648
# Extract the "urls_url" elements, and flatten() the result
urls_clean <- map(rstudioconf, "urls_url") %>%
flatten()
# Remove the NULL
compact_urls <- compact(urls_clean)
compact_urls <- discard(urls_clean, is.na) # Due to creation of the list above, NULL became NA
# Create a mapper that detects the pattern "github"
# as_mapper() turns the one-sided formula into a one-argument function of .x
has_github <- as_mapper(~ str_detect(.x, "github"))
# Look for the "github" pattern, and sum the result
map_lgl( compact_urls, has_github ) %>%
sum()
## [1] 347
# Complete the function
# ratio_pattern(): share of elements in `vec` that match `pattern`.
# mean() of a logical vector equals sum(matches) / length(vec), so this
# is the same computation as the sum-then-divide version, including NA
# propagation and the NaN result on an empty input.
ratio_pattern <- function(vec, pattern){
  mean(str_detect(vec, pattern))
}
# Create flatten_and_compact()
# discard(x, is.na) drops scalar NA entries; needed because building the
# list earlier turned NULLs into NAs (same issue worked around above).
extraDiscard <- function(x) { discard(x, is.na) } # address same NA issue as above
# Applied right-to-left: flatten one level, drop NAs, then drop NULLs.
flatten_and_compact <- purrr::compose(compact, extraDiscard, flatten)
# Complete the pipe to get the ratio of URLs with "github"
map(rstudioconf, "urls_url") %>%
flatten_and_compact() %>%
ratio_pattern("github")
## [1] 0.2943172
# Create mean_above, a mapper that tests if .x is over 3.3
mean_above <- as_mapper(~ .x > 3.3)
# Prefil map_at() with "retweet_count", mean_above for above,
# and mean_above negation for below
above <- partial(map_at, .at = "retweet_count", .f = mean_above )
below <- partial(map_at, .at = "retweet_count", .f = negate(mean_above) )
# Map above() and below() on non_rt, keep the "retweet_count"
# ab <- map(non_rt, above) %>% keep("retweet_count")
# bl <- map(non_rt, below) %>% keep("retweet_count")
# Compare the size of both elements
# length(ab)
# length(bl)
# Get the max() of "retweet_count"
max_rt <- map_dbl(non_rt, "retweet_count") %>%
max()
# Prefill map_at() with a mapper testing if .x equal max_rt
# max_rt_calc <- partial(map_at, .at = "retweet_count", .f = ~.x==max_rt )
# Map max_rt_calc on non_rt, keep the retweet_count & flatten
# res <- map(non_rt, max_rt_calc) %>%
# keep("retweet_count") %>%
# flatten()
# Print the "screen_name" and "text" of the result
# res$screen_name
# res$text
Chapter 1 - Simplifying Iteration and Lists with purrr
The power of iteration:
Subsetting lists:
Multiple flavors of map():
Example code includes:
files <- list('data_from_1990.csv', 'data_from_1991.csv', 'data_from_1992.csv', 'data_from_1993.csv', 'data_from_1994.csv', 'data_from_1995.csv', 'data_from_1996.csv', 'data_from_1997.csv', 'data_from_1998.csv', 'data_from_1999.csv', 'data_from_2000.csv', 'data_from_2001.csv', 'data_from_2002.csv', 'data_from_2003.csv', 'data_from_2004.csv', 'data_from_2005.csv'
)
files <- map(files, function(x) { paste0("./RInputFiles/", x) })
# Initialize list
all_files <- list()
# For loop to read files into a list
for(i in seq_along(files)){
all_files[[i]] <- readr::read_csv(file = files[[i]])
}
## Parsed with column specification:
## cols(
## years = col_double(),
## a = col_double(),
## b = col_double()
## )
## Parsed with column specification:
## cols(
## years = col_double(),
## a = col_double(),
## b = col_double()
## )
## Parsed with column specification:
## cols(
## years = col_double(),
## a = col_double(),
## b = col_double()
## )
## Parsed with column specification:
## cols(
## years = col_double(),
## a = col_double(),
## b = col_double()
## )
## Parsed with column specification:
## cols(
## years = col_double(),
## a = col_double(),
## b = col_double()
## )
## Parsed with column specification:
## cols(
## years = col_double(),
## a = col_double(),
## b = col_double()
## )
## Parsed with column specification:
## cols(
## years = col_double(),
## a = col_double(),
## b = col_double()
## )
## Parsed with column specification:
## cols(
## years = col_double(),
## a = col_double(),
## b = col_double()
## )
## Parsed with column specification:
## cols(
## years = col_double(),
## a = col_double(),
## b = col_double()
## )
## Parsed with column specification:
## cols(
## years = col_double(),
## a = col_double(),
## b = col_double()
## )
## Parsed with column specification:
## cols(
## years = col_double(),
## a = col_double(),
## b = col_double()
## )
## Parsed with column specification:
## cols(
## years = col_double(),
## a = col_double(),
## b = col_double()
## )
## Parsed with column specification:
## cols(
## years = col_double(),
## a = col_double(),
## b = col_double()
## )
## Parsed with column specification:
## cols(
## years = col_double(),
## a = col_double(),
## b = col_double()
## )
## Parsed with column specification:
## cols(
## years = col_double(),
## a = col_double(),
## b = col_double()
## )
## Parsed with column specification:
## cols(
## years = col_double(),
## a = col_double(),
## b = col_double()
## )
# Output size of list object
length(all_files)
## [1] 16
# Use map to iterate
all_files_purrr <- purrr::map(files, read_csv)
## Parsed with column specification:
## cols(
## years = col_double(),
## a = col_double(),
## b = col_double()
## )
## Parsed with column specification:
## cols(
## years = col_double(),
## a = col_double(),
## b = col_double()
## )
## Parsed with column specification:
## cols(
## years = col_double(),
## a = col_double(),
## b = col_double()
## )
## Parsed with column specification:
## cols(
## years = col_double(),
## a = col_double(),
## b = col_double()
## )
## Parsed with column specification:
## cols(
## years = col_double(),
## a = col_double(),
## b = col_double()
## )
## Parsed with column specification:
## cols(
## years = col_double(),
## a = col_double(),
## b = col_double()
## )
## Parsed with column specification:
## cols(
## years = col_double(),
## a = col_double(),
## b = col_double()
## )
## Parsed with column specification:
## cols(
## years = col_double(),
## a = col_double(),
## b = col_double()
## )
## Parsed with column specification:
## cols(
## years = col_double(),
## a = col_double(),
## b = col_double()
## )
## Parsed with column specification:
## cols(
## years = col_double(),
## a = col_double(),
## b = col_double()
## )
## Parsed with column specification:
## cols(
## years = col_double(),
## a = col_double(),
## b = col_double()
## )
## Parsed with column specification:
## cols(
## years = col_double(),
## a = col_double(),
## b = col_double()
## )
## Parsed with column specification:
## cols(
## years = col_double(),
## a = col_double(),
## b = col_double()
## )
## Parsed with column specification:
## cols(
## years = col_double(),
## a = col_double(),
## b = col_double()
## )
## Parsed with column specification:
## cols(
## years = col_double(),
## a = col_double(),
## b = col_double()
## )
## Parsed with column specification:
## cols(
## years = col_double(),
## a = col_double(),
## b = col_double()
## )
# Output size of list object
length(all_files_purrr)
## [1] 16
temp <- c("1", "2", "3", "4")
list_of_df <- list(temp, temp, temp, temp, temp, temp, temp, temp, temp, temp)
# Check the class type of the first element
class(list_of_df[[1]])
## [1] "character"
# Change each element from a character to a number
for(i in seq_along(list_of_df)){
list_of_df[[i]] <- as.numeric(list_of_df[[i]])
}
# Check the class type of the first element
class(list_of_df[[1]])
## [1] "numeric"
# Print out the list
list_of_df
## [[1]]
## [1] 1 2 3 4
##
## [[2]]
## [1] 1 2 3 4
##
## [[3]]
## [1] 1 2 3 4
##
## [[4]]
## [1] 1 2 3 4
##
## [[5]]
## [1] 1 2 3 4
##
## [[6]]
## [1] 1 2 3 4
##
## [[7]]
## [1] 1 2 3 4
##
## [[8]]
## [1] 1 2 3 4
##
## [[9]]
## [1] 1 2 3 4
##
## [[10]]
## [1] 1 2 3 4
temp <- c("1", "2", "3", "4")
list_of_df <- list(temp, temp, temp, temp, temp, temp, temp, temp, temp, temp)
# Check the class type of the first element
class(list_of_df[[1]])
## [1] "character"
# Change each character element to a number
list_of_df <- map(list_of_df, as.numeric)
# Check the class type of the first element again
class(list_of_df[[1]])
## [1] "numeric"
# Print out the list
list_of_df
## [[1]]
## [1] 1 2 3 4
##
## [[2]]
## [1] 1 2 3 4
##
## [[3]]
## [1] 1 2 3 4
##
## [[4]]
## [1] 1 2 3 4
##
## [[5]]
## [1] 1 2 3 4
##
## [[6]]
## [1] 1 2 3 4
##
## [[7]]
## [1] 1 2 3 4
##
## [[8]]
## [1] 1 2 3 4
##
## [[9]]
## [1] 1 2 3 4
##
## [[10]]
## [1] 1 2 3 4
# Load wesanderson dataset
data(wesanderson, package="repurrrsive")
# Get structure of first element in wesanderson
str(wesanderson[[1]])
## chr [1:4] "#F1BB7B" "#FD6467" "#5B1A18" "#D67236"
# Get structure of GrandBudapest element in wesanderson
str(wesanderson$GrandBudapest)
## chr [1:4] "#F1BB7B" "#FD6467" "#5B1A18" "#D67236"
# Third element of the first wesanderson vector
wesanderson[[1]][3]
## [1] "#5B1A18"
# Fourth element of the GrandBudapest wesanderson vector
wesanderson$GrandBudapest[4]
## [1] "#D67236"
data(sw_films, package="repurrrsive")
# Subset the first element of the sw_films data
sw_films[[1]]
## $title
## [1] "A New Hope"
##
## $episode_id
## [1] 4
##
## $opening_crawl
## [1] "It is a period of civil war.\r\nRebel spaceships, striking\r\nfrom a hidden base, have won\r\ntheir first victory against\r\nthe evil Galactic Empire.\r\n\r\nDuring the battle, Rebel\r\nspies managed to steal secret\r\nplans to the Empire's\r\nultimate weapon, the DEATH\r\nSTAR, an armored space\r\nstation with enough power\r\nto destroy an entire planet.\r\n\r\nPursued by the Empire's\r\nsinister agents, Princess\r\nLeia races home aboard her\r\nstarship, custodian of the\r\nstolen plans that can save her\r\npeople and restore\r\nfreedom to the galaxy...."
##
## $director
## [1] "George Lucas"
##
## $producer
## [1] "Gary Kurtz, Rick McCallum"
##
## $release_date
## [1] "1977-05-25"
##
## $characters
## [1] "http://swapi.co/api/people/1/" "http://swapi.co/api/people/2/"
## [3] "http://swapi.co/api/people/3/" "http://swapi.co/api/people/4/"
## [5] "http://swapi.co/api/people/5/" "http://swapi.co/api/people/6/"
## [7] "http://swapi.co/api/people/7/" "http://swapi.co/api/people/8/"
## [9] "http://swapi.co/api/people/9/" "http://swapi.co/api/people/10/"
## [11] "http://swapi.co/api/people/12/" "http://swapi.co/api/people/13/"
## [13] "http://swapi.co/api/people/14/" "http://swapi.co/api/people/15/"
## [15] "http://swapi.co/api/people/16/" "http://swapi.co/api/people/18/"
## [17] "http://swapi.co/api/people/19/" "http://swapi.co/api/people/81/"
##
## $planets
## [1] "http://swapi.co/api/planets/2/" "http://swapi.co/api/planets/3/"
## [3] "http://swapi.co/api/planets/1/"
##
## $starships
## [1] "http://swapi.co/api/starships/2/" "http://swapi.co/api/starships/3/"
## [3] "http://swapi.co/api/starships/5/" "http://swapi.co/api/starships/9/"
## [5] "http://swapi.co/api/starships/10/" "http://swapi.co/api/starships/11/"
## [7] "http://swapi.co/api/starships/12/" "http://swapi.co/api/starships/13/"
##
## $vehicles
## [1] "http://swapi.co/api/vehicles/4/" "http://swapi.co/api/vehicles/6/"
## [3] "http://swapi.co/api/vehicles/7/" "http://swapi.co/api/vehicles/8/"
##
## $species
## [1] "http://swapi.co/api/species/5/" "http://swapi.co/api/species/3/"
## [3] "http://swapi.co/api/species/2/" "http://swapi.co/api/species/1/"
## [5] "http://swapi.co/api/species/4/"
##
## $created
## [1] "2014-12-10T14:23:31.880000Z"
##
## $edited
## [1] "2015-04-11T09:46:52.774897Z"
##
## $url
## [1] "http://swapi.co/api/films/1/"
# Subset the first element of the sw_films data, title column
sw_films[[1]]$title
## [1] "A New Hope"
# Map over wesanderson to get the length of each element
map(wesanderson, length)
## $GrandBudapest
## [1] 4
##
## $Moonrise1
## [1] 4
##
## $Royal1
## [1] 4
##
## $Moonrise2
## [1] 4
##
## $Cavalcanti
## [1] 5
##
## $Royal2
## [1] 5
##
## $GrandBudapest2
## [1] 4
##
## $Moonrise3
## [1] 5
##
## $Chevalier
## [1] 4
##
## $Zissou
## [1] 5
##
## $FantasticFox
## [1] 5
##
## $Darjeeling
## [1] 5
##
## $Rushmore
## [1] 5
##
## $BottleRocket
## [1] 7
##
## $Darjeeling2
## [1] 5
# Map over wesanderson, and determine the length of each element
map(wesanderson, ~length(.x))
## $GrandBudapest
## [1] 4
##
## $Moonrise1
## [1] 4
##
## $Royal1
## [1] 4
##
## $Moonrise2
## [1] 4
##
## $Cavalcanti
## [1] 5
##
## $Royal2
## [1] 5
##
## $GrandBudapest2
## [1] 4
##
## $Moonrise3
## [1] 5
##
## $Chevalier
## [1] 4
##
## $Zissou
## [1] 5
##
## $FantasticFox
## [1] 5
##
## $Darjeeling
## [1] 5
##
## $Rushmore
## [1] 5
##
## $BottleRocket
## [1] 7
##
## $Darjeeling2
## [1] 5
# Map over wesanderson and determine the length of each element
map(wesanderson, length)
## $GrandBudapest
## [1] 4
##
## $Moonrise1
## [1] 4
##
## $Royal1
## [1] 4
##
## $Moonrise2
## [1] 4
##
## $Cavalcanti
## [1] 5
##
## $Royal2
## [1] 5
##
## $GrandBudapest2
## [1] 4
##
## $Moonrise3
## [1] 5
##
## $Chevalier
## [1] 4
##
## $Zissou
## [1] 5
##
## $FantasticFox
## [1] 5
##
## $Darjeeling
## [1] 5
##
## $Rushmore
## [1] 5
##
## $BottleRocket
## [1] 7
##
## $Darjeeling2
## [1] 5
# Create a numcolors column and fill with length of each wesanderson element
data.frame(numcolors = map_dbl(wesanderson, ~length(.x)))
## numcolors
## GrandBudapest 4
## Moonrise1 4
## Royal1 4
## Moonrise2 4
## Cavalcanti 5
## Royal2 5
## GrandBudapest2 4
## Moonrise3 5
## Chevalier 4
## Zissou 5
## FantasticFox 5
## Darjeeling 5
## Rushmore 5
## BottleRocket 7
## Darjeeling2 5
Chapter 2 - More Complex Iterations
Working with unnamed lists:
More map():
map2() and pmap():
Example code includes:
# Use pipes to check for names in sw_films
sw_films %>%
names()
## NULL
# Set names so each element of the list is named for the film title
sw_films_named <- sw_films %>%
set_names(map_chr(., "title"))
# Check to see if the names worked/are correct
names(sw_films_named)
## [1] "A New Hope" "Attack of the Clones"
## [3] "The Phantom Menace" "Revenge of the Sith"
## [5] "Return of the Jedi" "The Empire Strikes Back"
## [7] "The Force Awakens"
# Create a list of values from 1 through 10
numlist <- list(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
# Iterate over the numlist
map(numlist, ~.x %>% sqrt() %>% sin())
## [[1]]
## [1] 0.841471
##
## [[2]]
## [1] 0.9877659
##
## [[3]]
## [1] 0.9870266
##
## [[4]]
## [1] 0.9092974
##
## [[5]]
## [1] 0.7867491
##
## [[6]]
## [1] 0.6381576
##
## [[7]]
## [1] 0.4757718
##
## [[8]]
## [1] 0.3080717
##
## [[9]]
## [1] 0.14112
##
## [[10]]
## [1] -0.02068353
# List of sites north, east, and west
sites <- list("north", "east", "west")
# Create a list of dataframes, each with a years, a, and b column
list_of_df <- map(sites,
~data.frame(years = .x,
a = rnorm(mean = 5, n = 200, sd = 5/2),
b = rnorm(mean = 200, n = 200, sd = 15)))
map(list_of_df, head)
## [[1]]
## years a b
## 1 north 5.419245 185.6379
## 2 north 3.497293 191.3937
## 3 north 7.542372 203.1696
## 4 north 1.237095 200.3886
## 5 north 7.965691 202.3536
## 6 north 5.997626 198.6213
##
## [[2]]
## years a b
## 1 east 3.959230 210.2183
## 2 east 7.794994 202.4002
## 3 east 1.787426 215.2296
## 4 east 7.324983 211.6860
## 5 east 3.200095 167.6267
## 6 east 1.898869 176.9505
##
## [[3]]
## years a b
## 1 west 1.650724 200.3956
## 2 west 5.358391 196.1960
## 3 west 6.100584 193.0619
## 4 west 3.674544 226.0657
## 5 west 7.548899 183.7324
## 6 west 7.492224 218.4433
# Map over the models to look at the relationship of a vs b
list_of_df %>%
map(~ lm(a ~ b, data = .)) %>%
map(~summary(.))
## [[1]]
##
## Call:
## lm(formula = a ~ b, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.9588 -1.7913 0.2551 1.9562 8.1414
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.624601 2.752832 2.043 0.0424 *
## b -0.003813 0.013789 -0.277 0.7824
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.763 on 198 degrees of freedom
## Multiple R-squared: 0.0003861, Adjusted R-squared: -0.004662
## F-statistic: 0.07647 on 1 and 198 DF, p-value: 0.7824
##
##
## [[2]]
##
## Call:
## lm(formula = a ~ b, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.3600 -1.8732 -0.3591 2.0309 7.6241
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.573714 2.427035 2.297 0.0227 *
## b -0.003159 0.012032 -0.263 0.7931
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.722 on 198 degrees of freedom
## Multiple R-squared: 0.0003481, Adjusted R-squared: -0.004701
## F-statistic: 0.06895 on 1 and 198 DF, p-value: 0.7931
##
##
## [[3]]
##
## Call:
## lm(formula = a ~ b, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.1929 -1.6531 -0.0416 1.3332 7.8433
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.28039 2.38101 3.898 0.000133 ***
## b -0.02109 0.01185 -1.781 0.076507 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.553 on 198 degrees of freedom
## Multiple R-squared: 0.01576, Adjusted R-squared: 0.01079
## F-statistic: 3.171 on 1 and 198 DF, p-value: 0.07651
# Pull out the director element of sw_films in a list and character vector
map(sw_films, ~.x[["director"]])
## [[1]]
## [1] "George Lucas"
##
## [[2]]
## [1] "George Lucas"
##
## [[3]]
## [1] "George Lucas"
##
## [[4]]
## [1] "George Lucas"
##
## [[5]]
## [1] "Richard Marquand"
##
## [[6]]
## [1] "Irvin Kershner"
##
## [[7]]
## [1] "J. J. Abrams"
map_chr(sw_films, ~.x[["director"]])
## [1] "George Lucas" "George Lucas" "George Lucas" "George Lucas"
## [5] "Richard Marquand" "Irvin Kershner" "J. J. Abrams"
# Compare outputs when checking if director is George Lucas
map(sw_films, ~.x[["director"]] == "George Lucas")
## [[1]]
## [1] TRUE
##
## [[2]]
## [1] TRUE
##
## [[3]]
## [1] TRUE
##
## [[4]]
## [1] TRUE
##
## [[5]]
## [1] FALSE
##
## [[6]]
## [1] FALSE
##
## [[7]]
## [1] FALSE
map_lgl(sw_films, ~.x[["director"]] == "George Lucas")
## [1] TRUE TRUE TRUE TRUE FALSE FALSE FALSE
# Pull out episode_id element as list
map(sw_films, ~.x[["episode_id"]])
## [[1]]
## [1] 4
##
## [[2]]
## [1] 2
##
## [[3]]
## [1] 1
##
## [[4]]
## [1] 3
##
## [[5]]
## [1] 6
##
## [[6]]
## [1] 5
##
## [[7]]
## [1] 7
# Pull out episode_id element as double vector
map_dbl(sw_films, ~.x[["episode_id"]])
## [1] 4 2 1 3 6 5 7
# Pull out episode_id element as list
map(sw_films, ~.x[["episode_id"]])
## [[1]]
## [1] 4
##
## [[2]]
## [1] 2
##
## [[3]]
## [1] 1
##
## [[4]]
## [1] 3
##
## [[5]]
## [1] 6
##
## [[6]]
## [1] 5
##
## [[7]]
## [1] 7
# Pull out episode_id element as integer vector
map_int(sw_films, ~.x[["episode_id"]])
## [1] 4 2 1 3 6 5 7
# List of 1 through 3
means <- list(1, 2, 3)
# Create sites list
sites <- list("north", "west", "east")
# Map over two arguments: years and mu
list_of_files_map2 <- map2(sites, means, ~data.frame(sites = .x,
a = rnorm(mean = .y, n = 200, sd = (5/2))))
map(list_of_files_map2, head)
## [[1]]
## sites a
## 1 north 0.5819726
## 2 north -0.5746658
## 3 north 0.1643412
## 4 north -2.1005153
## 5 north -2.2215248
## 6 north -3.4375990
##
## [[2]]
## sites a
## 1 west 1.1073218
## 2 west -1.0274293
## 3 west 3.6507615
## 4 west 0.6089474
## 5 west -1.8287453
## 6 west 3.3056101
##
## [[3]]
## sites a
## 1 east 9.5459135
## 2 east 2.0063471
## 3 east 4.9414991
## 4 east 4.9324294
## 5 east 2.7412434
## 6 east 0.4964416
means <- list(1, 2, 3)
sigma <- list(1, 2, 3)
means2 <- list(0.5, 1, 1.5)
sigma2 <- list(0.5, 1, 1.5)
# Create a master list, a list of lists
pmapinputs <- list(sites = sites, means1 = means, sigma1 = sigma,
means2 = means2, sigma2 = sigma2)
# Map over the master list
list_of_files_pmap <- pmap(pmapinputs,
function(sites, means1, sigma1, means2, sigma2) {
data.frame(years = sites,
a = rnorm(mean = means1, n = 200, sd = sigma1),
b = rnorm(mean = means2, n = 200, sd = sigma2)
)
}
)
map(list_of_files_pmap, head)
## [[1]]
## years a b
## 1 north 0.4063538 0.09759733
## 2 north 2.0099744 0.11524764
## 3 north -0.8131808 0.62864546
## 4 north -0.1670964 0.71861601
## 5 north 0.7262024 0.32455889
## 6 north 0.6275768 0.73348169
##
## [[2]]
## years a b
## 1 west 2.5286327 1.7629410
## 2 west 1.9901082 0.6423823
## 3 west 4.0098040 0.2195289
## 4 west -0.4119218 0.6716433
## 5 west 2.0980822 2.1730813
## 6 west 3.8050698 0.9769756
##
## [[3]]
## years a b
## 1 east 2.9726321 -0.9428948
## 2 east 0.2710128 0.5352085
## 3 east 0.3507006 1.1646529
## 4 east 3.0210231 1.7266897
## 5 east -3.0043878 2.2508411
## 6 east 1.3111049 1.5101823
Chapter 3 - Troubleshooting Lists with purrr
How to purrr safely():
Another way to possibly() purrr:
purrr is a walk() in the park:
Example code includes:
# Map safely over log
a <- list(-10, 1, 10, 0) %>%
map(safely(log, otherwise = NA_real_)) %>%
# Transpose the result
transpose()
## Warning in .f(...): NaNs produced
# Print the list
a
## $result
## $result[[1]]
## [1] NaN
##
## $result[[2]]
## [1] 0
##
## $result[[3]]
## [1] 2.302585
##
## $result[[4]]
## [1] -Inf
##
##
## $error
## $error[[1]]
## NULL
##
## $error[[2]]
## NULL
##
## $error[[3]]
## NULL
##
## $error[[4]]
## NULL
# Print the result element in the list
a[["result"]]
## [[1]]
## [1] NaN
##
## [[2]]
## [1] 0
##
## [[3]]
## [1] 2.302585
##
## [[4]]
## [1] -Inf
# Print the error element in the list
a[["error"]]
## [[1]]
## NULL
##
## [[2]]
## NULL
##
## [[3]]
## NULL
##
## [[4]]
## NULL
# Load sw_people data
data(sw_people, package="repurrrsive")
# Map over sw_people and pull out the height element
# Each height is a character scalar with "unknown" as the missing-value
# sentinel. Since the mapped function receives one scalar at a time, a
# plain if/else is the right construct (ifelse() is for vectors), and
# NA_real_ keeps the result consistently numeric instead of logical NA.
height_cm <- map(sw_people, "height") %>%
  map(function(x) { if (x == "unknown") NA_real_ else as.numeric(x) })
# Map over sw_people and pull out the height element
# Same extraction converted to feet (1 cm = 0.0328084 ft). safely()
# captures any conversion error instead of aborting the whole map;
# transpose() regroups the output into $result / $error lists.
height_ft <- map(sw_people , "height") %>%
  map(safely(function(x){
    if (x == "unknown") NA_real_ else as.numeric(x) * 0.0328084
  }, quiet = FALSE)) %>%
  transpose()
# Print your list, the result element, and the error element
walk(height_ft, function(x) { print(x[1:10]) })
## [[1]]
## [1] 5.643045
##
## [[2]]
## [1] 5.479003
##
## [[3]]
## [1] 3.149606
##
## [[4]]
## [1] 6.627297
##
## [[5]]
## [1] 4.92126
##
## [[6]]
## [1] 5.839895
##
## [[7]]
## [1] 5.413386
##
## [[8]]
## [1] 3.182415
##
## [[9]]
## [1] 6.003937
##
## [[10]]
## [1] 5.971129
##
## [[1]]
## NULL
##
## [[2]]
## NULL
##
## [[3]]
## NULL
##
## [[4]]
## NULL
##
## [[5]]
## NULL
##
## [[6]]
## NULL
##
## [[7]]
## NULL
##
## [[8]]
## NULL
##
## [[9]]
## NULL
##
## [[10]]
## NULL
height_ft[["result"]][1:10]
## [[1]]
## [1] 5.643045
##
## [[2]]
## [1] 5.479003
##
## [[3]]
## [1] 3.149606
##
## [[4]]
## [1] 6.627297
##
## [[5]]
## [1] 4.92126
##
## [[6]]
## [1] 5.839895
##
## [[7]]
## [1] 5.413386
##
## [[8]]
## [1] 3.182415
##
## [[9]]
## [1] 6.003937
##
## [[10]]
## [1] 5.971129
height_ft[["error"]][1:10]
## [[1]]
## NULL
##
## [[2]]
## NULL
##
## [[3]]
## NULL
##
## [[4]]
## NULL
##
## [[5]]
## NULL
##
## [[6]]
## NULL
##
## [[7]]
## NULL
##
## [[8]]
## NULL
##
## [[9]]
## NULL
##
## [[10]]
## NULL
# Take the log of each element in the list
a <- list(-10, 1, 10, 0) %>%
map(possibly(function(x){ log(x) }, otherwise=NA_real_))
## Warning in log(x): NaNs produced
# Create a piped workflow that returns double vectors
height_cm %>%
map_dbl(possibly(function(x){ x * 0.0328084 }, otherwise=NA_real_))
## [1] 5.643045 5.479003 3.149606 6.627297 4.921260 5.839895 5.413386 3.182415
## [9] 6.003937 5.971129 6.167979 5.905512 7.480315 5.905512 5.675853 5.741470
## [17] 5.577428 5.905512 2.165354 5.577428 6.003937 6.561680 6.233596 5.807087
## [25] 5.741470 5.905512 4.921260 NA 2.887139 5.249344 6.332021 6.266404
## [33] 5.577428 6.430446 7.349082 6.758530 6.003937 4.494751 3.674541 6.003937
## [41] 5.347769 5.741470 5.905512 5.839895 3.083990 4.002625 5.347769 6.167979
## [49] 6.496063 6.430446 5.610236 6.036746 6.167979 8.661418 6.167979 6.430446
## [57] 6.069554 5.150919 6.003937 6.003937 5.577428 5.446194 5.413386 6.332021
## [65] 6.266404 6.003937 5.511811 6.496063 7.513124 6.988189 5.479003 2.591864
## [73] 3.149606 6.332021 6.266404 5.839895 7.086614 7.677166 6.167979 5.839895
## [81] 6.758530 NA NA NA NA NA 5.413386
# Extract each film's URL (one string per film) and its character list.
films <- map_chr(sw_films, "url")
people <- map(sw_films, "characters")
# Build a long tibble with one row per (film, character) pair: rep()
# repeats each film URL once per character in that film, so the `films`
# column lines up element-for-element with the unlisted character URLs.
people_by_film <- tibble::tibble(films = rep(films, times=map_int(people, length)),
film_url = unlist(people)
)
# Print with walk
# walk() iterates for the side effect (printing) and returns invisibly.
walk(people_by_film, print)
## [1] "http://swapi.co/api/films/1/" "http://swapi.co/api/films/1/"
## [3] "http://swapi.co/api/films/1/" "http://swapi.co/api/films/1/"
## [5] "http://swapi.co/api/films/1/" "http://swapi.co/api/films/1/"
## [7] "http://swapi.co/api/films/1/" "http://swapi.co/api/films/1/"
## [9] "http://swapi.co/api/films/1/" "http://swapi.co/api/films/1/"
## [11] "http://swapi.co/api/films/1/" "http://swapi.co/api/films/1/"
## [13] "http://swapi.co/api/films/1/" "http://swapi.co/api/films/1/"
## [15] "http://swapi.co/api/films/1/" "http://swapi.co/api/films/1/"
## [17] "http://swapi.co/api/films/1/" "http://swapi.co/api/films/1/"
## [19] "http://swapi.co/api/films/5/" "http://swapi.co/api/films/5/"
## [21] "http://swapi.co/api/films/5/" "http://swapi.co/api/films/5/"
## [23] "http://swapi.co/api/films/5/" "http://swapi.co/api/films/5/"
## [25] "http://swapi.co/api/films/5/" "http://swapi.co/api/films/5/"
## [27] "http://swapi.co/api/films/5/" "http://swapi.co/api/films/5/"
## [29] "http://swapi.co/api/films/5/" "http://swapi.co/api/films/5/"
## [31] "http://swapi.co/api/films/5/" "http://swapi.co/api/films/5/"
## [33] "http://swapi.co/api/films/5/" "http://swapi.co/api/films/5/"
## [35] "http://swapi.co/api/films/5/" "http://swapi.co/api/films/5/"
## [37] "http://swapi.co/api/films/5/" "http://swapi.co/api/films/5/"
## [39] "http://swapi.co/api/films/5/" "http://swapi.co/api/films/5/"
## [41] "http://swapi.co/api/films/5/" "http://swapi.co/api/films/5/"
## [43] "http://swapi.co/api/films/5/" "http://swapi.co/api/films/5/"
## [45] "http://swapi.co/api/films/5/" "http://swapi.co/api/films/5/"
## [47] "http://swapi.co/api/films/5/" "http://swapi.co/api/films/5/"
## [49] "http://swapi.co/api/films/5/" "http://swapi.co/api/films/5/"
## [51] "http://swapi.co/api/films/5/" "http://swapi.co/api/films/5/"
## [53] "http://swapi.co/api/films/5/" "http://swapi.co/api/films/5/"
## [55] "http://swapi.co/api/films/5/" "http://swapi.co/api/films/5/"
## [57] "http://swapi.co/api/films/5/" "http://swapi.co/api/films/5/"
## [59] "http://swapi.co/api/films/4/" "http://swapi.co/api/films/4/"
## [61] "http://swapi.co/api/films/4/" "http://swapi.co/api/films/4/"
## [63] "http://swapi.co/api/films/4/" "http://swapi.co/api/films/4/"
## [65] "http://swapi.co/api/films/4/" "http://swapi.co/api/films/4/"
## [67] "http://swapi.co/api/films/4/" "http://swapi.co/api/films/4/"
## [69] "http://swapi.co/api/films/4/" "http://swapi.co/api/films/4/"
## [71] "http://swapi.co/api/films/4/" "http://swapi.co/api/films/4/"
## [73] "http://swapi.co/api/films/4/" "http://swapi.co/api/films/4/"
## [75] "http://swapi.co/api/films/4/" "http://swapi.co/api/films/4/"
## [77] "http://swapi.co/api/films/4/" "http://swapi.co/api/films/4/"
## [79] "http://swapi.co/api/films/4/" "http://swapi.co/api/films/4/"
## [81] "http://swapi.co/api/films/4/" "http://swapi.co/api/films/4/"
## [83] "http://swapi.co/api/films/4/" "http://swapi.co/api/films/4/"
## [85] "http://swapi.co/api/films/4/" "http://swapi.co/api/films/4/"
## [87] "http://swapi.co/api/films/4/" "http://swapi.co/api/films/4/"
## [89] "http://swapi.co/api/films/4/" "http://swapi.co/api/films/4/"
## [91] "http://swapi.co/api/films/4/" "http://swapi.co/api/films/4/"
## [93] "http://swapi.co/api/films/6/" "http://swapi.co/api/films/6/"
## [95] "http://swapi.co/api/films/6/" "http://swapi.co/api/films/6/"
## [97] "http://swapi.co/api/films/6/" "http://swapi.co/api/films/6/"
## [99] "http://swapi.co/api/films/6/" "http://swapi.co/api/films/6/"
## [101] "http://swapi.co/api/films/6/" "http://swapi.co/api/films/6/"
## [103] "http://swapi.co/api/films/6/" "http://swapi.co/api/films/6/"
## [105] "http://swapi.co/api/films/6/" "http://swapi.co/api/films/6/"
## [107] "http://swapi.co/api/films/6/" "http://swapi.co/api/films/6/"
## [109] "http://swapi.co/api/films/6/" "http://swapi.co/api/films/6/"
## [111] "http://swapi.co/api/films/6/" "http://swapi.co/api/films/6/"
## [113] "http://swapi.co/api/films/6/" "http://swapi.co/api/films/6/"
## [115] "http://swapi.co/api/films/6/" "http://swapi.co/api/films/6/"
## [117] "http://swapi.co/api/films/6/" "http://swapi.co/api/films/6/"
## [119] "http://swapi.co/api/films/6/" "http://swapi.co/api/films/6/"
## [121] "http://swapi.co/api/films/6/" "http://swapi.co/api/films/6/"
## [123] "http://swapi.co/api/films/6/" "http://swapi.co/api/films/6/"
## [125] "http://swapi.co/api/films/6/" "http://swapi.co/api/films/6/"
## [127] "http://swapi.co/api/films/3/" "http://swapi.co/api/films/3/"
## [129] "http://swapi.co/api/films/3/" "http://swapi.co/api/films/3/"
## [131] "http://swapi.co/api/films/3/" "http://swapi.co/api/films/3/"
## [133] "http://swapi.co/api/films/3/" "http://swapi.co/api/films/3/"
## [135] "http://swapi.co/api/films/3/" "http://swapi.co/api/films/3/"
## [137] "http://swapi.co/api/films/3/" "http://swapi.co/api/films/3/"
## [139] "http://swapi.co/api/films/3/" "http://swapi.co/api/films/3/"
## [141] "http://swapi.co/api/films/3/" "http://swapi.co/api/films/3/"
## [143] "http://swapi.co/api/films/3/" "http://swapi.co/api/films/3/"
## [145] "http://swapi.co/api/films/3/" "http://swapi.co/api/films/3/"
## [147] "http://swapi.co/api/films/2/" "http://swapi.co/api/films/2/"
## [149] "http://swapi.co/api/films/2/" "http://swapi.co/api/films/2/"
## [151] "http://swapi.co/api/films/2/" "http://swapi.co/api/films/2/"
## [153] "http://swapi.co/api/films/2/" "http://swapi.co/api/films/2/"
## [155] "http://swapi.co/api/films/2/" "http://swapi.co/api/films/2/"
## [157] "http://swapi.co/api/films/2/" "http://swapi.co/api/films/2/"
## [159] "http://swapi.co/api/films/2/" "http://swapi.co/api/films/2/"
## [161] "http://swapi.co/api/films/2/" "http://swapi.co/api/films/2/"
## [163] "http://swapi.co/api/films/7/" "http://swapi.co/api/films/7/"
## [165] "http://swapi.co/api/films/7/" "http://swapi.co/api/films/7/"
## [167] "http://swapi.co/api/films/7/" "http://swapi.co/api/films/7/"
## [169] "http://swapi.co/api/films/7/" "http://swapi.co/api/films/7/"
## [171] "http://swapi.co/api/films/7/" "http://swapi.co/api/films/7/"
## [173] "http://swapi.co/api/films/7/"
## [1] "http://swapi.co/api/people/1/" "http://swapi.co/api/people/2/"
## [3] "http://swapi.co/api/people/3/" "http://swapi.co/api/people/4/"
## [5] "http://swapi.co/api/people/5/" "http://swapi.co/api/people/6/"
## [7] "http://swapi.co/api/people/7/" "http://swapi.co/api/people/8/"
## [9] "http://swapi.co/api/people/9/" "http://swapi.co/api/people/10/"
## [11] "http://swapi.co/api/people/12/" "http://swapi.co/api/people/13/"
## [13] "http://swapi.co/api/people/14/" "http://swapi.co/api/people/15/"
## [15] "http://swapi.co/api/people/16/" "http://swapi.co/api/people/18/"
## [17] "http://swapi.co/api/people/19/" "http://swapi.co/api/people/81/"
## [19] "http://swapi.co/api/people/2/" "http://swapi.co/api/people/3/"
## [21] "http://swapi.co/api/people/6/" "http://swapi.co/api/people/7/"
## [23] "http://swapi.co/api/people/10/" "http://swapi.co/api/people/11/"
## [25] "http://swapi.co/api/people/20/" "http://swapi.co/api/people/21/"
## [27] "http://swapi.co/api/people/22/" "http://swapi.co/api/people/33/"
## [29] "http://swapi.co/api/people/36/" "http://swapi.co/api/people/40/"
## [31] "http://swapi.co/api/people/43/" "http://swapi.co/api/people/46/"
## [33] "http://swapi.co/api/people/51/" "http://swapi.co/api/people/52/"
## [35] "http://swapi.co/api/people/53/" "http://swapi.co/api/people/58/"
## [37] "http://swapi.co/api/people/59/" "http://swapi.co/api/people/60/"
## [39] "http://swapi.co/api/people/61/" "http://swapi.co/api/people/62/"
## [41] "http://swapi.co/api/people/63/" "http://swapi.co/api/people/64/"
## [43] "http://swapi.co/api/people/65/" "http://swapi.co/api/people/66/"
## [45] "http://swapi.co/api/people/67/" "http://swapi.co/api/people/68/"
## [47] "http://swapi.co/api/people/69/" "http://swapi.co/api/people/70/"
## [49] "http://swapi.co/api/people/71/" "http://swapi.co/api/people/72/"
## [51] "http://swapi.co/api/people/73/" "http://swapi.co/api/people/74/"
## [53] "http://swapi.co/api/people/75/" "http://swapi.co/api/people/76/"
## [55] "http://swapi.co/api/people/77/" "http://swapi.co/api/people/78/"
## [57] "http://swapi.co/api/people/82/" "http://swapi.co/api/people/35/"
## [59] "http://swapi.co/api/people/2/" "http://swapi.co/api/people/3/"
## [61] "http://swapi.co/api/people/10/" "http://swapi.co/api/people/11/"
## [63] "http://swapi.co/api/people/16/" "http://swapi.co/api/people/20/"
## [65] "http://swapi.co/api/people/21/" "http://swapi.co/api/people/32/"
## [67] "http://swapi.co/api/people/33/" "http://swapi.co/api/people/34/"
## [69] "http://swapi.co/api/people/36/" "http://swapi.co/api/people/37/"
## [71] "http://swapi.co/api/people/38/" "http://swapi.co/api/people/39/"
## [73] "http://swapi.co/api/people/40/" "http://swapi.co/api/people/41/"
## [75] "http://swapi.co/api/people/42/" "http://swapi.co/api/people/43/"
## [77] "http://swapi.co/api/people/44/" "http://swapi.co/api/people/46/"
## [79] "http://swapi.co/api/people/48/" "http://swapi.co/api/people/49/"
## [81] "http://swapi.co/api/people/50/" "http://swapi.co/api/people/51/"
## [83] "http://swapi.co/api/people/52/" "http://swapi.co/api/people/53/"
## [85] "http://swapi.co/api/people/54/" "http://swapi.co/api/people/55/"
## [87] "http://swapi.co/api/people/56/" "http://swapi.co/api/people/57/"
## [89] "http://swapi.co/api/people/58/" "http://swapi.co/api/people/59/"
## [91] "http://swapi.co/api/people/47/" "http://swapi.co/api/people/35/"
## [93] "http://swapi.co/api/people/1/" "http://swapi.co/api/people/2/"
## [95] "http://swapi.co/api/people/3/" "http://swapi.co/api/people/4/"
## [97] "http://swapi.co/api/people/5/" "http://swapi.co/api/people/6/"
## [99] "http://swapi.co/api/people/7/" "http://swapi.co/api/people/10/"
## [101] "http://swapi.co/api/people/11/" "http://swapi.co/api/people/12/"
## [103] "http://swapi.co/api/people/13/" "http://swapi.co/api/people/20/"
## [105] "http://swapi.co/api/people/21/" "http://swapi.co/api/people/33/"
## [107] "http://swapi.co/api/people/46/" "http://swapi.co/api/people/51/"
## [109] "http://swapi.co/api/people/52/" "http://swapi.co/api/people/53/"
## [111] "http://swapi.co/api/people/54/" "http://swapi.co/api/people/55/"
## [113] "http://swapi.co/api/people/56/" "http://swapi.co/api/people/58/"
## [115] "http://swapi.co/api/people/63/" "http://swapi.co/api/people/64/"
## [117] "http://swapi.co/api/people/67/" "http://swapi.co/api/people/68/"
## [119] "http://swapi.co/api/people/75/" "http://swapi.co/api/people/78/"
## [121] "http://swapi.co/api/people/79/" "http://swapi.co/api/people/80/"
## [123] "http://swapi.co/api/people/81/" "http://swapi.co/api/people/82/"
## [125] "http://swapi.co/api/people/83/" "http://swapi.co/api/people/35/"
## [127] "http://swapi.co/api/people/1/" "http://swapi.co/api/people/2/"
## [129] "http://swapi.co/api/people/3/" "http://swapi.co/api/people/4/"
## [131] "http://swapi.co/api/people/5/" "http://swapi.co/api/people/10/"
## [133] "http://swapi.co/api/people/13/" "http://swapi.co/api/people/14/"
## [135] "http://swapi.co/api/people/16/" "http://swapi.co/api/people/18/"
## [137] "http://swapi.co/api/people/20/" "http://swapi.co/api/people/21/"
## [139] "http://swapi.co/api/people/22/" "http://swapi.co/api/people/25/"
## [141] "http://swapi.co/api/people/27/" "http://swapi.co/api/people/28/"
## [143] "http://swapi.co/api/people/29/" "http://swapi.co/api/people/30/"
## [145] "http://swapi.co/api/people/31/" "http://swapi.co/api/people/45/"
## [147] "http://swapi.co/api/people/1/" "http://swapi.co/api/people/2/"
## [149] "http://swapi.co/api/people/3/" "http://swapi.co/api/people/4/"
## [151] "http://swapi.co/api/people/5/" "http://swapi.co/api/people/10/"
## [153] "http://swapi.co/api/people/13/" "http://swapi.co/api/people/14/"
## [155] "http://swapi.co/api/people/18/" "http://swapi.co/api/people/20/"
## [157] "http://swapi.co/api/people/21/" "http://swapi.co/api/people/22/"
## [159] "http://swapi.co/api/people/23/" "http://swapi.co/api/people/24/"
## [161] "http://swapi.co/api/people/25/" "http://swapi.co/api/people/26/"
## [163] "http://swapi.co/api/people/1/" "http://swapi.co/api/people/3/"
## [165] "http://swapi.co/api/people/5/" "http://swapi.co/api/people/13/"
## [167] "http://swapi.co/api/people/14/" "http://swapi.co/api/people/27/"
## [169] "http://swapi.co/api/people/84/" "http://swapi.co/api/people/85/"
## [171] "http://swapi.co/api/people/86/" "http://swapi.co/api/people/87/"
## [173] "http://swapi.co/api/people/88/"
# Load the gapminder panel data (one row per country-year; see str() below)
data(gapminder, package="gapminder")
str(gapminder)
## Classes 'tbl_df', 'tbl' and 'data.frame': 1704 obs. of 6 variables:
## $ country : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ year : int 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
## $ lifeExp : num 28.8 30.3 32 34 36.1 ...
## $ pop : int 8425333 9240934 10267083 11537966 13079460 14880372 12881816 13867957 16317921 22227415 ...
## $ gdpPercap: num 779 821 853 836 740 ...
# Partition gapminder into a named list of per-country data frames
gap_split <- split(gapminder, f = gapminder[["country"]])
# Inspect the first element
gap_split[[1]]
## # A tibble: 12 x 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Afghanistan Asia 1952 28.8 8425333 779.
## 2 Afghanistan Asia 1957 30.3 9240934 821.
## 3 Afghanistan Asia 1962 32.0 10267083 853.
## 4 Afghanistan Asia 1967 34.0 11537966 836.
## 5 Afghanistan Asia 1972 36.1 13079460 740.
## 6 Afghanistan Asia 1977 38.4 14880372 786.
## 7 Afghanistan Asia 1982 39.9 12881816 978.
## 8 Afghanistan Asia 1987 40.8 13867957 852.
## 9 Afghanistan Asia 1992 41.7 16317921 649.
## 10 Afghanistan Asia 1997 41.8 22227415 635.
## 11 Afghanistan Asia 2002 42.1 25268405 727.
## 12 Afghanistan Asia 2007 43.8 31889923 975.
# Map over the first 10 elements of gap_split
# map2 pairs each country's data (.x) with its name (.y): one line plot
# of life expectancy over time per country, titled with the country name.
plots <- map2(gap_split[1:10], names(gap_split[1:10]),
~ ggplot(.x, aes(year, lifeExp)) + geom_line() + labs(title = .y)
)
# Object name, then function name: print each plot for its side effect
walk(plots, print)
Chapter 4 - Problem Solving with purrr
Using purrr in your workflow:
Even more complex problems:
Graphs in purrr:
Wrap up:
Example code includes:
# Load the data
data(gh_users, package="repurrrsive")
# Check if data has names (NULL: the list elements are unnamed)
names(gh_users)
## NULL
# Map over name element of list
# (~.x[["name"]] is equivalent to the shortcut map(gh_users, "name"))
map(gh_users, ~.x[["name"]])
## [[1]]
## [1] "Gábor Csárdi"
##
## [[2]]
## [1] "Jennifer (Jenny) Bryan"
##
## [[3]]
## [1] "Jeff L."
##
## [[4]]
## [1] "Julia Silge"
##
## [[5]]
## [1] "Thomas J. Leeper"
##
## [[6]]
## [1] "Maëlle Salmon"
# Name gh_users with each user's display name, so elements can be
# looked up and printed by name rather than position
gh_users <- set_names(gh_users, map_chr(gh_users, "name"))
# Load gh_repos (a list of repo lists, one element per user)
data(gh_repos, package = "repurrrsive")
# str(gh_repos) # List is much too long for str() printing
# Name gh_repos_named after each user's login, taken from the owner of
# their first repository. (Written directly instead of the original's
# piped form, which relied on magrittr suppressing first-argument
# insertion when `.` appears explicitly in the call.)
gh_repos_named <- set_names(
  gh_repos,
  map_chr(gh_repos, ~ map_chr(.x, ~ .x$owner$login)[1])
)
# Determine who joined GitHub first: label each account-creation
# timestamp with the user's name and sort ascending (ISO-8601 strings
# order chronologically when sorted as text)
map_chr(gh_users, "created_at") %>%
  set_names(map_chr(gh_users, "name")) %>%
  sort()
## Jennifer (Jenny) Bryan Gábor Csárdi Jeff L.
## "2011-02-03T22:37:41Z" "2011-03-09T17:29:25Z" "2012-03-24T18:16:43Z"
## Thomas J. Leeper Maëlle Salmon Julia Silge
## "2013-02-07T21:07:00Z" "2014-08-05T08:10:04Z" "2015-05-19T02:51:23Z"
# Determine user versus organization: TRUE when every account's
# "type" field equals "User"
all(map_lgl(gh_users, ~ .x[["type"]] == "User"))
## [1] TRUE
# Determine who has the most public repositories: counts labelled by
# user name, sorted ascending so the largest appears last
map_int(gh_users, "public_repos") %>%
  set_names(map_chr(gh_users, "name")) %>%
  sort()
## Julia Silge Maëlle Salmon Gábor Csárdi
## 26 31 52
## Jeff L. Thomas J. Leeper Jennifer (Jenny) Bryan
## 67 99 168
# Name each gh_repos element after its owner's login, taken from the
# first repository in that user's list
gh_repos <- set_names(
  gh_repos,
  map_chr(gh_repos, ~ map_chr(.x, ~ .x$owner$login)[1])
)
# Check to make sure list has the right names
names(gh_repos)
## [1] "gaborcsardi" "jennybc" "jtleek" "juliasilge" "leeper"
## [6] "masalmon"
# For each user, pull every repository's size...
map(gh_repos, ~ map_dbl(.x, "size")) %>%
  # ...then reduce each user's size vector to its maximum
  map(max)
## $gaborcsardi
## [1] 39461
##
## $jennybc
## [1] 96325
##
## $jtleek
## [1] 374812
##
## $juliasilge
## [1] 24070
##
## $leeper
## [1] 558176
##
## $masalmon
## [1] 76455
# One row per GitHub user: public repo count and follower count
gh_users_df <- tibble::tibble(
  public_repos = map_int(gh_users, "public_repos"),
  followers = map_int(gh_users, "followers")
)
# Scatter plot of public repos and followers
ggplot(gh_users_df, aes(x = public_repos, y = followers)) +
  geom_point()
map(gh_repos_named, "followers")
## $gaborcsardi
## NULL
##
## $jennybc
## NULL
##
## $jtleek
## NULL
##
## $juliasilge
## NULL
##
## $leeper
## NULL
##
## $masalmon
## NULL
# Histogram of followers (default binning; see the warning below)
ggplot(gh_users_df, aes(x = followers)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Build a four-column data frame by slicing each user's list with `[`,
# then scatter-plot public repo count against follower count
map_df(gh_users, `[`, c("login", "name", "followers", "public_repos")) %>%
  ggplot(aes(x = followers, y = public_repos)) +
  geom_point()
# Turn data into correct dataframe format: one row per (film, character)
film_by_character <- tibble(filmtitle = map_chr(sw_films, "title")) %>%
  transmute(filmtitle, characters = map(sw_films, "characters")) %>%
  # FIX: `cols` is required in tidyr >= 1.0; leaving it implicit
  # triggered the deprecation warning "`cols` is now required"
  unnest(cols = c(characters))
# Pull out elements from sw_people
sw_characters <- map_df(sw_people, `[`, c("height", "mass", "name", "url"))
# Join the two new objects on the character URL
inner_join(film_by_character, sw_characters, by = c("characters" = "url")) %>%
  # Parse height/mass to numbers: "unknown" (and anything else that
  # fails as.numeric) becomes NA; thousands separators are stripped
  mutate(height1 = ifelse(height=="unknown", NA, as.numeric(height)),
         mass1 = ifelse(mass=="unknown", NA, as.numeric(stringr::str_replace(mass, ",", "")))
  ) %>%
  # FIX: filter and plot on the parsed numeric column. The original used
  # the raw character `height`, so height1/mass1 were computed but never
  # used and non-numeric heights were never actually dropped.
  filter(!is.na(height1)) %>%
  ggplot(aes(x = height1)) +
  geom_bar(stat="count") +
  # geom_histogram(stat = "count") +
  facet_wrap(~filmtitle)
## Warning in ifelse(height == "unknown", NA, as.numeric(height)): NAs introduced
## by coercion
## Warning in ifelse(mass == "unknown", NA, as.numeric(stringr::str_replace(mass, :
## NAs introduced by coercion
Chapter 1 - Joining Multiple data.tables
Introduction:
Merge function:
Left and right joins:
Example code includes:
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:xts':
##
## first, last
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following object is masked from 'package:purrr':
##
## transpose
# Read the Netflix shows and IMDB ratings as data.tables
netflix <- fread("./RInputFiles/netflix_2017.csv", sep=",")
imdb <- fread("./RInputFiles/imdb_ratings.csv", sep=",")
# What data.tables are in my R session?
tables()
## Registered S3 methods overwritten by 'lava':
## method from
## plot.sim huge
## print.sim huge
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
## NAME NROW NCOL MB COLS KEY
## 1: imdb 11 2 0 title,rating
## 2: netflix 8 3 0 title,episodes,release_date
## Total: 0MB
# View the first six rows of each table
head(netflix)
## title episodes release_date
## 1: A Series of Unfortunate Events 8 13-Jan-17
## 2: 13 Reasons Why 13 31-Mar-17
## 3: Gypsy 10 30-Jun-17
## 4: Ozark 10 21-Jul-17
## 5: Mindhunter 10 13-Oct-17
## 6: Longmire 10 17-Nov-17
head(imdb)
## title rating
## 1: The Orville 7.7
## 2: Big Mouth 8.3
## 3: The Gifted 8.2
## 4: Gypsy 7.0
## 5: Inhumans 5.4
## 6: 13 Reasons Why 8.4
# Print the structure of both data.tables
str(netflix)
## Classes 'data.table' and 'data.frame': 8 obs. of 3 variables:
## $ title : chr "A Series of Unfortunate Events" "13 Reasons Why" "Gypsy" "Ozark" ...
## $ episodes : int 8 13 10 10 10 10 6 10
## $ release_date: chr "13-Jan-17" "31-Mar-17" "30-Jun-17" "21-Jul-17" ...
## - attr(*, ".internal.selfref")=<externalptr>
str(imdb)
## Classes 'data.table' and 'data.frame': 11 obs. of 2 variables:
## $ title : chr "The Orville" "Big Mouth" "The Gifted" "Gypsy" ...
## $ rating: num 7.7 8.3 8.2 7 5.4 8.4 7.3 8.9 8.4 8.5 ...
## - attr(*, ".internal.selfref")=<externalptr>
# Print the data.tables in your R session (8 shows vs 11 ratings; only
# some titles overlap, which drives the join examples below)
netflix
## title episodes release_date
## 1: A Series of Unfortunate Events 8 13-Jan-17
## 2: 13 Reasons Why 13 31-Mar-17
## 3: Gypsy 10 30-Jun-17
## 4: Ozark 10 21-Jul-17
## 5: Mindhunter 10 13-Oct-17
## 6: Longmire 10 17-Nov-17
## 7: Godless 6 22-Nov-17
## 8: Dark 10 1-Dec-17
imdb
## title rating
## 1: The Orville 7.7
## 2: Big Mouth 8.3
## 3: The Gifted 8.2
## 4: Gypsy 7.0
## 5: Inhumans 5.4
## 6: 13 Reasons Why 8.4
## 7: Star Trek: Discovery 7.3
## 8: Mindhunter 8.9
## 9: Mystery Science Theatre 3000: The Return 8.4
## 10: Ozark 8.5
## 11: A Series of Unfortunate Events 7.9
# Inner join netflix and imdb: default merge() keeps only titles
# present in both tables
merge(netflix, imdb, by = "title")
## title episodes release_date rating
## 1: 13 Reasons Why 13 31-Mar-17 8.4
## 2: A Series of Unfortunate Events 8 13-Jan-17 7.9
## 3: Gypsy 10 30-Jun-17 7.0
## 4: Mindhunter 10 13-Oct-17 8.9
## 5: Ozark 10 21-Jul-17 8.5
# Full join netflix and imdb: all=TRUE keeps unmatched rows from both
merge(netflix, imdb, by = "title", all=TRUE)
## title episodes release_date rating
## 1: 13 Reasons Why 13 31-Mar-17 8.4
## 2: A Series of Unfortunate Events 8 13-Jan-17 7.9
## 3: Big Mouth NA <NA> 8.3
## 4: Dark 10 1-Dec-17 NA
## 5: Godless 6 22-Nov-17 NA
## 6: Gypsy 10 30-Jun-17 7.0
## 7: Inhumans NA <NA> 5.4
## 8: Longmire 10 17-Nov-17 NA
## 9: Mindhunter 10 13-Oct-17 8.9
## 10: Mystery Science Theatre 3000: The Return NA <NA> 8.4
## 11: Ozark 10 21-Jul-17 8.5
## 12: Star Trek: Discovery NA <NA> 7.3
## 13: The Gifted NA <NA> 8.2
## 14: The Orville NA <NA> 7.7
# Full join imdb and netflix: same rows as above, but column order
# follows the first (x) table
merge(imdb, netflix, by = "title", all = TRUE)
## title rating episodes release_date
## 1: 13 Reasons Why 8.4 13 31-Mar-17
## 2: A Series of Unfortunate Events 7.9 8 13-Jan-17
## 3: Big Mouth 8.3 NA <NA>
## 4: Dark NA 10 1-Dec-17
## 5: Godless NA 6 22-Nov-17
## 6: Gypsy 7.0 10 30-Jun-17
## 7: Inhumans 5.4 NA <NA>
## 8: Longmire NA 10 17-Nov-17
## 9: Mindhunter 8.9 10 13-Oct-17
## 10: Mystery Science Theatre 3000: The Return 8.4 NA <NA>
## 11: Ozark 8.5 10 21-Jul-17
## 12: Star Trek: Discovery 7.3 NA <NA>
## 13: The Gifted 8.2 NA <NA>
## 14: The Orville 7.7 NA <NA>
# Left join imdb to netflix: all.x=TRUE keeps every netflix row,
# with NA rating where imdb has no match
merge(netflix, imdb, by="title", all.x=TRUE)
## title episodes release_date rating
## 1: 13 Reasons Why 13 31-Mar-17 8.4
## 2: A Series of Unfortunate Events 8 13-Jan-17 7.9
## 3: Dark 10 1-Dec-17 NA
## 4: Godless 6 22-Nov-17 NA
## 5: Gypsy 10 30-Jun-17 7.0
## 6: Longmire 10 17-Nov-17 NA
## 7: Mindhunter 10 13-Oct-17 8.9
## 8: Ozark 10 21-Jul-17 8.5
# Right join imdb to netflix: all.y=TRUE keeps every imdb row
merge(netflix, imdb, by="title", all.y=TRUE)
## title episodes release_date rating
## 1: 13 Reasons Why 13 31-Mar-17 8.4
## 2: A Series of Unfortunate Events 8 13-Jan-17 7.9
## 3: Big Mouth NA <NA> 8.3
## 4: Gypsy 10 30-Jun-17 7.0
## 5: Inhumans NA <NA> 5.4
## 6: Mindhunter 10 13-Oct-17 8.9
## 7: Mystery Science Theatre 3000: The Return NA <NA> 8.4
## 8: Ozark 10 21-Jul-17 8.5
## 9: Star Trek: Discovery NA <NA> 7.3
## 10: The Gifted NA <NA> 8.2
## 11: The Orville NA <NA> 7.7
# Compare to a left join of netflix to imdb: same matched rows as the
# right join above, with imdb's columns first
merge(imdb, netflix, by="title", all.x=TRUE)
## title rating episodes release_date
## 1: 13 Reasons Why 8.4 13 31-Mar-17
## 2: A Series of Unfortunate Events 7.9 8 13-Jan-17
## 3: Big Mouth 8.3 NA <NA>
## 4: Gypsy 7.0 10 30-Jun-17
## 5: Inhumans 5.4 NA <NA>
## 6: Mindhunter 8.9 10 13-Oct-17
## 7: Mystery Science Theatre 3000: The Return 8.4 NA <NA>
## 8: Ozark 8.5 10 21-Jul-17
## 9: Star Trek: Discovery 7.3 NA <NA>
## 10: The Gifted 8.2 NA <NA>
## 11: The Orville 7.7 NA <NA>
# Read the Australian geography tables
australia_area <- fread("./RInputFiles/australia_area.csv", sep=",")
australia_capitals <- fread("./RInputFiles/australia_capitals.csv", sep=",")
australia_cities_top20 <- fread("./RInputFiles/australia_cities_top20.csv", sep=",")
# Identify the key for joining capitals and population
capitals_population_key <- "city"
# Left join population to capitals (only the city and population columns
# of the top-20 table are brought across)
capital_pop <- merge(australia_capitals,
australia_cities_top20[, c("city", "population")],
by=capitals_population_key, all.x=TRUE
)
# NOTE(review): Canberra's population comes back NA below, apparently
# because the capitals file spells the city with a (mis-encoded) en dash
# while the top-20 file uses a hyphen ("Canberra-Queanbeyan"), so the
# keys never match — worth normalizing city names before joining.
capital_pop
## city state country population
## 1: Adelaide South Australia Australia 1324279
## 2: Brisbane Queensland Australia 2360241
## 3: Canberra–Queanbeyan Australian Capital Territory Australia NA
## 4: Darwin Northern Territory Australia 145916
## 5: Hobart Tasmania Australia 224462
## 6: Melbourne Victoria Australia 4725316
## 7: Perth Western Australia Australia 2022044
## 8: Sydney New South Wales Australia 5029768
# Identify the key for joining capital_pop and area
capital_pop_area_key <- "state"
# Inner join area to capital pop (keeps only states present in both)
australia_stats <- merge(capital_pop, australia_area[, c("state", "area_km2")], by=capital_pop_area_key)
# Print the final result
australia_stats
## state city country population
## 1: Australian Capital Territory Canberra–Queanbeyan Australia NA
## 2: New South Wales Sydney Australia 5029768
## 3: Northern Territory Darwin Australia 145916
## 4: Queensland Brisbane Australia 2360241
## 5: South Australia Adelaide Australia 1324279
## 6: Tasmania Hobart Australia 224462
## 7: Victoria Melbourne Australia 4725316
## 8: Western Australia Perth Australia 2022044
## area_km2
## 1: 2358
## 2: 800641
## 3: 1349129
## 4: 1730647
## 5: 983482
## 6: 68401
## 7: 227416
## 8: 2529875
Chapter 2 - Joins Using data.table Syntax
Joins using data.table syntax:
Setting and viewing data.table keys:
Incorporating joins in the data.table workflow:
Example code includes:
# Right join population to capitals using data.table syntax:
# X[Y, on=...] returns every row of Y (the top-20 cities) with matching
# columns of X (the capitals) attached; non-capitals get NA
australia_capitals[australia_cities_top20, on = "city"]
## city state country population percentage
## 1: Sydney New South Wales Australia 5029768 0.2074
## 2: Melbourne Victoria Australia 4725316 0.1924
## 3: Brisbane Queensland Australia 2360241 0.0974
## 4: Perth Western Australia Australia 2022044 0.0856
## 5: Adelaide South Australia Australia 1324279 0.0550
## 6: Gold Coast-Tweed Heads <NA> <NA> 646983 0.0264
## 7: Newcastle-Maitland <NA> <NA> 436171 0.0182
## 8: Canberra-Queanbeyan <NA> <NA> 435019 0.0178
## 9: Sunshine Coast <NA> <NA> 317404 0.0127
## 10: Wollongong <NA> <NA> 295669 0.0123
## 11: Hobart Tasmania Australia 224462 0.0092
## 12: Geelong <NA> <NA> 192393 0.0079
## 13: Townsville <NA> <NA> 178864 0.0076
## 14: Cairns <NA> <NA> 150041 0.0062
## 15: Darwin Northern Territory Australia 145916 0.0060
## 16: Toowoomba <NA> <NA> 114024 0.0048
## 17: Ballarat <NA> <NA> 101588 0.0042
## 18: Bendigo <NA> <NA> 95587 0.0039
## 19: Albury-Wodonga <NA> <NA> 90576 0.0037
## 20: Launceston <NA> <NA> 86335 0.0036
# Right join using merge: same rows as the data.table-syntax join above,
# but merge() returns them sorted by the join key
merge(australia_capitals, australia_cities_top20, by = "city", all.y = TRUE)
## city state country population percentage
## 1: Adelaide South Australia Australia 1324279 0.0550
## 2: Albury-Wodonga <NA> <NA> 90576 0.0037
## 3: Ballarat <NA> <NA> 101588 0.0042
## 4: Bendigo <NA> <NA> 95587 0.0039
## 5: Brisbane Queensland Australia 2360241 0.0974
## 6: Cairns <NA> <NA> 150041 0.0062
## 7: Canberra-Queanbeyan <NA> <NA> 435019 0.0178
## 8: Darwin Northern Territory Australia 145916 0.0060
## 9: Geelong <NA> <NA> 192393 0.0079
## 10: Gold Coast-Tweed Heads <NA> <NA> 646983 0.0264
## 11: Hobart Tasmania Australia 224462 0.0092
## 12: Launceston <NA> <NA> 86335 0.0036
## 13: Melbourne Victoria Australia 4725316 0.1924
## 14: Newcastle-Maitland <NA> <NA> 436171 0.0182
## 15: Perth Western Australia Australia 2022044 0.0856
## 16: Sunshine Coast <NA> <NA> 317404 0.0127
## 17: Sydney New South Wales Australia 5029768 0.2074
## 18: Toowoomba <NA> <NA> 114024 0.0048
## 19: Townsville <NA> <NA> 178864 0.0076
## 20: Wollongong <NA> <NA> 295669 0.0123
# Inner join with the data.table syntax: nomatch=0L drops the
# australia_cities_top20 rows that have no matching capital
australia_capitals[australia_cities_top20, on="city", nomatch=0L]
## city state country population percentage
## 1: Sydney New South Wales Australia 5029768 0.2074
## 2: Melbourne Victoria Australia 4725316 0.1924
## 3: Brisbane Queensland Australia 2360241 0.0974
## 4: Perth Western Australia Australia 2022044 0.0856
## 5: Adelaide South Australia Australia 1324279 0.0550
## 6: Hobart Tasmania Australia 224462 0.0092
## 7: Darwin Northern Territory Australia 145916 0.0060
# Anti-join capitals to population: top-20 cities that are NOT capitals
australia_cities_top20[!australia_capitals, on="city"]
## city population percentage
## 1: Gold Coast-Tweed Heads 646983 0.0264
## 2: Newcastle-Maitland 436171 0.0182
## 3: Canberra-Queanbeyan 435019 0.0178
## 4: Sunshine Coast 317404 0.0127
## 5: Wollongong 295669 0.0123
## 6: Geelong 192393 0.0079
## 7: Townsville 178864 0.0076
## 8: Cairns 150041 0.0062
## 9: Toowoomba 114024 0.0048
## 10: Ballarat 101588 0.0042
## 11: Bendigo 95587 0.0039
## 12: Albury-Wodonga 90576 0.0037
## 13: Launceston 86335 0.0036
# Anti-join capitals to area: listed areas with no capital in the table
australia_area[!australia_capitals, on="state"]
## state area_km2 percentage
## 1: Australian Antarctic Territory 5896500 NA
## 2: Heard Island and McDonald Islands 372 0
## 3: Ashmore and Cartier Islands 199 0
## 4: Christmas Island 135 0
## 5: Jervis Bay Territory 73 0
## 6: Norfolk Island 35 0
## 7: Cocos (Keeling) Islands 14 0
## 8: Coral Sea Islands 10 0
# Set the keys so subsequent joins can omit the on= argument
setkey(netflix, "title")
setkey(imdb, "title")
# Inner join on the shared key; nomatch=0L drops non-matching rows
netflix[imdb, nomatch=0L]
## title episodes release_date rating
## 1: 13 Reasons Why 13 31-Mar-17 8.4
## 2: A Series of Unfortunate Events 8 13-Jan-17 7.9
## 3: Gypsy 10 30-Jun-17 7.0
## 4: Mindhunter 10 13-Oct-17 8.9
## 5: Ozark 10 21-Jul-17 8.5
# Check for keys (both were set above)
haskey(netflix)
## [1] TRUE
haskey(imdb)
## [1] TRUE
# Find the key
the_key <- "title"
# Set the key for the other data.table
# setkeyv() takes the key name(s) as a character variable — the
# programmatic form of setkey()
setkeyv(imdb, the_key)
# Inner join capitals to population
australia_cities_top20[australia_capitals, on="city", nomatch=0L]
## city population percentage state country
## 1: Sydney 5029768 0.2074 New South Wales Australia
## 2: Melbourne 4725316 0.1924 Victoria Australia
## 3: Brisbane 2360241 0.0974 Queensland Australia
## 4: Perth 2022044 0.0856 Western Australia Australia
## 5: Adelaide 1324279 0.0550 South Australia Australia
## 6: Hobart 224462 0.0092 Tasmania Australia
## 7: Darwin 145916 0.0060 Northern Territory Australia
# Join and sum: j = sum(percentage) computes, inside the join result,
# the capitals' combined share of the national population
australia_cities_top20[australia_capitals, on = .(city), nomatch = 0, j = sum(percentage)]
## [1] 0.653
# Read the continent lookup and 2010 life-expectancy tables
continents <- fread("./RInputFiles/continents.csv", sep=",")
life_exp <- fread("./RInputFiles/gapminder_life_expectancy_2010.csv", sep=",")
# Standardize the value column name for the joins below
life_exp <- life_exp %>% rename(years = life_expectancy)
str(continents)
## Classes 'data.table' and 'data.frame': 235 obs. of 2 variables:
## $ continent: chr "africa" "africa" "africa" "africa" ...
## $ country : chr "Algeria" "Angola" "Benin" "Botswana" ...
## - attr(*, ".internal.selfref")=<externalptr>
str(life_exp)
## Classes 'data.table' and 'data.frame': 208 obs. of 2 variables:
## $ country: chr "Afghanistan" "Albania" "Algeria" "American Samoa" ...
## $ years : num 53.6 77.2 76 72.8 84.7 ...
## - attr(*, ".internal.selfref")=<externalptr>
# What countries are listed in multiple continents?
# by = .EACHI computes .N per join-key value of life_exp, so N is the
# number of continent rows each country matched; N > 1 flags duplicates
continents[life_exp, on = .(country), .N, by = .EACHI][N > 1]
## country N
## 1: Armenia 2
## 2: Azerbaijan 2
## 3: Cyprus 2
## 4: Georgia 2
## 5: Kazakhstan 2
## 6: Russia 2
## 7: Turkey 2
# Calculate average life expectancy per continent:
# inner join (nomatch=0L), then mean(years) grouped by continent
avg_life_expectancy <- continents[life_exp, on = .(country), nomatch=0L][, j = mean(years), by = continent]
avg_life_expectancy
## continent V1
## 1: asia 73.27039
## 2: europe 77.43404
## 3: africa 61.00556
## 4: north_america 73.73964
## 5: south_america 74.41000
## 6: oceania 69.56077
Chapter 3 - Diagnosing and Fixing Common Join Problems
Complex keys:
Tricky columns:
Duplicate matches:
Example code includes:
# Read the five school-database tables (tab-separated)
guardians <- fread("./RInputFiles/school_db_guardians.tsv")
locations <- fread("./RInputFiles/school_db_locations.tsv")
students <- fread("./RInputFiles/school_db_students.tsv")
subjects <- fread("./RInputFiles/school_db_subjects.tsv")
teachers <- fread("./RInputFiles/school_db_teachers.tsv")
# Full join on "name" — this matches student names against guardian
# names, which never coincide here, hence the NA-heavy result below
merge(students, guardians, by="name", all=TRUE)
## name sex.x age.x guardian sex.y age.y phone
## 1: Aciano <NA> NA <NA> M 44 0163-680-95557
## 2: Adara F 16 Kiana <NA> NA <NA>
## 3: Caleb M 15 Tyler <NA> NA <NA>
## 4: Cierra F 17 Kiana <NA> NA <NA>
## 5: Elsa F 17 John <NA> NA <NA>
## 6: John <NA> NA <NA> M 34 0163-745-07369
## 7: Kalvin M 17 John <NA> NA <NA>
## 8: Kiana <NA> NA <NA> F 38 0163-875-41705
## 9: Makaela F 17 Nicole <NA> NA <NA>
## 10: Nicole <NA> NA <NA> F 26 0163-266-89055
## 11: Tyler <NA> NA <NA> M 48 0165-526-80087
## 12: Yelena F 17 Aciano <NA> NA <NA>
# data.table right join on the shared "name" column: only guardian rows
# come back, with all student columns NA, because no student name
# matches a guardian name
students[guardians, on="name"]
## name sex age guardian i.sex i.age phone
## 1: John <NA> NA <NA> M 34 0163-745-07369
## 2: Kiana <NA> NA <NA> F 38 0163-875-41705
## 3: Tyler <NA> NA <NA> M 48 0165-526-80087
## 4: Nicole <NA> NA <NA> F 26 0163-266-89055
## 5: Aciano <NA> NA <NA> M 44 0163-680-95055
# Change the code to an inner join
students[guardians, on = .(name), nomatch=0L]
## Empty data.table (0 rows and 7 cols): name,sex,age,guardian,i.sex,i.age...
# What are the correct join key columns?
# Match the students' guardian column against the guardians' name column
students[guardians, on = c("guardian"="name"), nomatch = 0L]
## name sex age guardian i.sex i.age phone
## 1: Kalvin M 17 John M 34 0163-745-07369
## 2: Elsa F 17 John M 34 0163-745-07369
## 3: Adara F 16 Kiana F 38 0163-875-41705
## 4: Cierra F 17 Kiana F 38 0163-875-41705
## 5: Caleb M 15 Tyler M 48 0165-526-80087
## 6: Makaela F 17 Nicole F 26 0163-266-89055
## 7: Yelena F 17 Aciano M 44 0163-680-95557
# Intentionally errors out due to type mismatch
# subjects[locations, on=c("class", "semester")]
# Structure
str(subjects)
## Classes 'data.table' and 'data.frame': 28 obs. of 3 variables:
## $ name : chr "Yelena" "Yelena" "Yelena" "Yelena" ...
## $ semester: int 1 1 2 2 1 1 2 2 1 1 ...
## $ class : chr "Mathematics" "Programming" "Language" "Art" ...
## - attr(*, ".internal.selfref")=<externalptr>
# Inspect locations' column classes to compare against subjects'
str(locations)
## Classes 'data.table' and 'data.frame': 16 obs. of 4 variables:
## $ class : chr "English" "Mathematics" "Art" "Programming" ...
## $ semester: int 1 1 1 1 1 1 1 1 2 2 ...
## $ building: chr "Block B" "Block C" "Block B" "Block A" ...
## $ room : chr "Room 103" "Room 104" "Room 102" "Room 102" ...
## - attr(*, ".internal.selfref")=<externalptr>
# Does semester have the same class?
# NOTE(review): in the original exercise locations$semester was presumably
# character; the str() output captured above already prints int for both
# tables, so it appears to have been recorded after the fix -- verify.
# FALSE records the answer to the exercise question.
same_class <- FALSE
# Fix the column class
# `:=` converts semester to integer in place so the join key classes match
locations[, semester := as.integer(semester)]
# Right join
# Keeps every row of locations; classes with no enrolled student show NA name
subjects[locations, on=c("class", "semester")]
## name semester class building room
## 1: Adara 1 English Block B Room 103
## 2: Cierra 1 English Block B Room 103
## 3: Yelena 1 Mathematics Block C Room 104
## 4: Elsa 1 Mathematics Block C Room 104
## 5: <NA> 1 Art Block B Room 102
## 6: Yelena 1 Programming Block A Room 102
## 7: Makaela 1 Programming Block A Room 102
## 8: Caleb 1 Programming Block A Room 102
## 9: Kalvin 1 History Block B Room 101
## 10: Adara 1 Geography Block A Room 101
## 11: Caleb 1 Geography Block A Room 101
## 12: Elsa 1 Politics Block A Room 104
## 13: Makaela 1 Politics Block A Room 104
## 14: Kalvin 1 Language Block C Room 103
## 15: Cierra 1 Language Block C Room 103
## 16: <NA> 2 English Block A Room 104
## 17: Caleb 2 Mathematics Block A Room 102
## 18: Yelena 2 Art Block B Room 102
## 19: Cierra 2 Art Block B Room 102
## 20: Makaela 2 Programming Block C Room 104
## 21: Elsa 2 History Block A Room 103
## 22: Adara 2 Geography Block C Room 103
## 23: Caleb 2 Geography Block C Room 103
## 24: Kalvin 2 Politics Block C Room 102
## 25: Elsa 2 Politics Block C Room 102
## 26: Adara 2 Politics Block C Room 102
## 27: Cierra 2 Politics Block C Room 102
## 28: Yelena 2 Language Block B Room 103
## 29: Kalvin 2 Language Block B Room 103
## 30: Makaela 2 Language Block B Room 103
## name semester class building room
# Identify and set the keys
# Named vector maps teachers$subject onto locations$class for the join
join_key <- c("subject"="class")
# Right join
# Keeps every row of locations, adding the teacher for each class
teachers[locations, on=join_key]
## teacher subject semester building room
## 1: Mr. Marquez English 1 Block B Room 103
## 2: Ms. Schoon Mathematics 1 Block C Room 104
## 3: Ms. Harris Art 1 Block B Room 102
## 4: Ms. Homann Programming 1 Block A Room 102
## 5: Mr. Santoyo History 1 Block B Room 101
## 6: Mr. Carbajal Geography 1 Block A Room 101
## 7: Ms. Limitone Politics 1 Block A Room 104
## 8: Ms. Low Language 1 Block C Room 103
## 9: Mr. Marquez English 2 Block A Room 104
## 10: Ms. Schoon Mathematics 2 Block A Room 102
## 11: Ms. Harris Art 2 Block B Room 102
## 12: Ms. Homann Programming 2 Block C Room 104
## 13: Mr. Santoyo History 2 Block A Room 103
## 14: Mr. Carbajal Geography 2 Block C Room 103
## 15: Ms. Limitone Politics 2 Block C Room 102
## 16: Ms. Low Language 2 Block B Room 103
# Inner join 1
# merge() performs an inner join by default (all = FALSE). The original code
# also passed nomatch=0L, but nomatch is an argument of data.table's `[` join
# syntax, not of merge(), where it was absorbed by `...` and had no effect --
# removed for clarity; the result is unchanged.
capital_pop <- merge(australia_capitals, australia_cities_top20, by="city")
# Inner join 2
# suffixes disambiguate the two "percentage" columns coming from each input
merge(capital_pop, australia_area, by="state", suffixes=c(".pop", ".area"))
## state city country population percentage.pop area_km2
## 1: New South Wales Sydney Australia 5029768 0.2074 800641
## 2: Northern Territory Darwin Australia 145916 0.0060 1349129
## 3: Queensland Brisbane Australia 2360241 0.0974 1730647
## 4: South Australia Adelaide Australia 1324279 0.0550 983482
## 5: Tasmania Hobart Australia 224462 0.0092 68401
## 6: Victoria Melbourne Australia 4725316 0.1924 227416
## 7: Western Australia Perth Australia 2022044 0.0856 2529875
## percentage.area
## 1: 0.1041
## 2: 0.1754
## 3: 0.2250
## 4: 0.1279
## 5: 0.0089
## 6: 0.0296
## 7: 0.3289
netflixOrig <- fread("./RInputFiles/netflix_2017.csv", sep=",")
imdb <- fread("./RInputFiles/imdb_ratings.csv", sep=",")
# Build a data.frame whose row names hold the series titles, to demonstrate
# recovering row names as a column when converting back to a data.table
netflix <- as.data.frame(netflixOrig)[, c("episodes", "release_date")]
rownames(netflix) <- netflixOrig$title
# Convert netflix to a data.table
# keep.rownames="series" turns the data.frame row names into a "series" column
netflix <- as.data.table(netflix, keep.rownames="series")
# Rename "title" to "series" in imdb
# setnames() with a single vector replaces all column names in place
setnames(imdb, c("series", "rating"))
# Right join
# Keeps every netflix series; those absent from imdb get NA ratings below
imdb[netflix, on="series"]
## series rating episodes release_date
## 1: A Series of Unfortunate Events 7.9 8 13-Jan-17
## 2: 13 Reasons Why 8.4 13 31-Mar-17
## 3: Gypsy 7.0 10 30-Jun-17
## 4: Ozark 8.5 10 21-Jul-17
## 5: Mindhunter 8.9 10 13-Oct-17
## 6: Longmire NA 10 17-Nov-17
## 7: Godless NA 6 22-Nov-17
## 8: Dark NA 10 1-Dec-17
# Load three coronary-heart-disease gene tables from different platforms
cardio <- fread("./RInputFiles/affymetrix_chd_genes.csv")
framingham <- fread("./RInputFiles/framingham_chd_genes.csv")
heart <- fread("./RInputFiles/illumina_chd_genes.csv")
# Try an inner join
# allow.cartesian=TRUE permits many-to-many matches; rows with a missing gene
# value all match each other, producing the spurious blank-gene block below
merge(heart, cardio, by=c("gene"), allow.cartesian=TRUE)
## gene ilmn_probe change.x pvalue.x affy_probe change.y pvalue.y
## 1: ILMN_1772594 1.26 8.7e-06 PSR06035179.hg.1 1.16 3.4e-05
## 2: ILMN_1772594 1.26 8.7e-06 PSR01070675.hg.1 1.10 7.2e-05
## 3: ILMN_1772594 1.26 8.7e-06 JUC05011543.hg.1 1.09 8.3e-05
## 4: ILMN_1772594 1.26 8.7e-06 PSR08024619.hg.1 1.05 1.5e-04
## 5: ILMN_1772594 1.26 8.7e-06 PSR20002720.hg.1 1.04 1.6e-04
## 6: ILMN_3206475 1.16 3.3e-05 PSR06035179.hg.1 1.16 3.4e-05
## 7: ILMN_3206475 1.16 3.3e-05 PSR01070675.hg.1 1.10 7.2e-05
## 8: ILMN_3206475 1.16 3.3e-05 JUC05011543.hg.1 1.09 8.3e-05
## 9: ILMN_3206475 1.16 3.3e-05 PSR08024619.hg.1 1.05 1.5e-04
## 10: ILMN_3206475 1.16 3.3e-05 PSR20002720.hg.1 1.04 1.6e-04
## 11: ILMN_1689153 1.11 6.4e-05 PSR06035179.hg.1 1.16 3.4e-05
## 12: ILMN_1689153 1.11 6.4e-05 PSR01070675.hg.1 1.10 7.2e-05
## 13: ILMN_1689153 1.11 6.4e-05 JUC05011543.hg.1 1.09 8.3e-05
## 14: ILMN_1689153 1.11 6.4e-05 PSR08024619.hg.1 1.05 1.5e-04
## 15: ILMN_1689153 1.11 6.4e-05 PSR20002720.hg.1 1.04 1.6e-04
## 16: ILMN_3282983 1.10 7.3e-05 PSR06035179.hg.1 1.16 3.4e-05
## 17: ILMN_3282983 1.10 7.3e-05 PSR01070675.hg.1 1.10 7.2e-05
## 18: ILMN_3282983 1.10 7.3e-05 JUC05011543.hg.1 1.09 8.3e-05
## 19: ILMN_3282983 1.10 7.3e-05 PSR08024619.hg.1 1.05 1.5e-04
## 20: ILMN_3282983 1.10 7.3e-05 PSR20002720.hg.1 1.04 1.6e-04
## 21: ILMN_1708533 -1.07 1.1e-04 PSR06035179.hg.1 1.16 3.4e-05
## 22: ILMN_1708533 -1.07 1.1e-04 PSR01070675.hg.1 1.10 7.2e-05
## 23: ILMN_1708533 -1.07 1.1e-04 JUC05011543.hg.1 1.09 8.3e-05
## 24: ILMN_1708533 -1.07 1.1e-04 PSR08024619.hg.1 1.05 1.5e-04
## 25: ILMN_1708533 -1.07 1.1e-04 PSR20002720.hg.1 1.04 1.6e-04
## 26: ILMN_1723249 1.06 1.3e-04 PSR06035179.hg.1 1.16 3.4e-05
## 27: ILMN_1723249 1.06 1.3e-04 PSR01070675.hg.1 1.10 7.2e-05
## 28: ILMN_1723249 1.06 1.3e-04 JUC05011543.hg.1 1.09 8.3e-05
## 29: ILMN_1723249 1.06 1.3e-04 PSR08024619.hg.1 1.05 1.5e-04
## 30: ILMN_1723249 1.06 1.3e-04 PSR20002720.hg.1 1.04 1.6e-04
## 31: ILMN_3206111 1.06 1.4e-04 PSR06035179.hg.1 1.16 3.4e-05
## 32: ILMN_3206111 1.06 1.4e-04 PSR01070675.hg.1 1.10 7.2e-05
## 33: ILMN_3206111 1.06 1.4e-04 JUC05011543.hg.1 1.09 8.3e-05
## 34: ILMN_3206111 1.06 1.4e-04 PSR08024619.hg.1 1.05 1.5e-04
## 35: ILMN_3206111 1.06 1.4e-04 PSR20002720.hg.1 1.04 1.6e-04
## 36: ABCB10 ILMN_1665730 1.05 1.4e-04 JUC01031346.hg.1 -1.11 6.4e-05
## 37: ABCB10 ILMN_1665730 1.05 1.4e-04 JUC01031349.hg.1 -1.08 9.9e-05
## 38: ABCB10 ILMN_1665730 1.05 1.4e-04 JUC01031353.hg.1 1.07 1.1e-04
## 39: ABCB10 ILMN_1665730 1.05 1.4e-04 PSR01059893.hg.1 1.04 1.6e-04
## 40: CIB2 ILMN_1714489 1.19 2.4e-05 PSR15016581.hg.1 -1.05 1.5e-04
## 41: FABP4 ILMN_1773006 -1.09 9.2e-05 PSR08017600.hg.1 1.22 1.6e-05
## 42: FABP4 ILMN_1773006 -1.09 9.2e-05 PSR08017606.hg.1 1.20 2.0e-05
## 43: FOXD4L4 ILMN_3238576 1.10 7.6e-05 PSR09002806.hg.1 -1.15 3.9e-05
## 44: FOXD4L4 ILMN_3238576 1.10 7.6e-05 PSR09002801.hg.1 -1.11 6.4e-05
## 45: FOXD4L4 ILMN_3238576 1.10 7.6e-05 PSR09015126.hg.1 1.07 1.1e-04
## 46: FOXD4L4 ILMN_3238576 1.06 1.2e-04 PSR09002806.hg.1 -1.15 3.9e-05
## 47: FOXD4L4 ILMN_3238576 1.06 1.2e-04 PSR09002801.hg.1 -1.11 6.4e-05
## 48: FOXD4L4 ILMN_3238576 1.06 1.2e-04 PSR09015126.hg.1 1.07 1.1e-04
## 49: GYPB ILMN_1683093 1.15 4.1e-05 PSR04029411.hg.1 1.13 5.1e-05
## 50: GYPB ILMN_1683093 1.15 4.1e-05 PSR04029413.hg.1 1.12 6.1e-05
## 51: GZMB ILMN_2109489 1.10 8.0e-05 PSR14011271.hg.1 1.08 1.0e-04
## 52: ISCU ILMN_2409062 1.11 6.4e-05 JUC12005887.hg.1 -1.09 8.3e-05
## 53: ISCU ILMN_2409062 1.11 6.4e-05 JUC12005889.hg.1 1.05 1.5e-04
## 54: ISCU ILMN_1735432 1.08 1.0e-04 JUC12005887.hg.1 -1.09 8.3e-05
## 55: ISCU ILMN_1735432 1.08 1.0e-04 JUC12005889.hg.1 1.05 1.5e-04
## 56: SEPHS1 ILMN_2176768 -1.16 3.3e-05 PSR10013017.hg.1 -1.18 2.6e-05
## 57: SEPHS1 ILMN_2176768 -1.16 3.3e-05 JUC10007537.hg.1 1.12 6.0e-05
## 58: SEPHS1 ILMN_2176768 -1.16 3.3e-05 JUC10007538.hg.1 -1.11 6.5e-05
## 59: SEPHS1 ILMN_1673369 1.09 8.2e-05 PSR10013017.hg.1 -1.18 2.6e-05
## 60: SEPHS1 ILMN_1673369 1.09 8.2e-05 JUC10007537.hg.1 1.12 6.0e-05
## 61: SEPHS1 ILMN_1673369 1.09 8.2e-05 JUC10007538.hg.1 -1.11 6.5e-05
## 62: TSTA3 ILMN_1697777 1.09 9.1e-05 JUC08011706.hg.1 -1.14 4.2e-05
## 63: ZDHHC2 ILMN_1769783 1.04 1.6e-04 JUC08000597.hg.1 1.12 5.9e-05
## gene ilmn_probe change.x pvalue.x affy_probe change.y pvalue.y
# Filter missing values
# Drop rows whose join key (gene) is NA so they cannot cross-match
heart_2 <- heart[!is.na(gene)]
cardio_2 <- cardio[!is.na(gene)]
# Repeat the inner join
# NOTE(review): the captured output below still shows the blank-gene block,
# suggesting those gene values are empty strings ("") rather than NA, so the
# is.na() filter did not remove them -- verify against the source files.
merge(heart_2, cardio_2, by=c("gene"), allow.cartesian=TRUE)
## gene ilmn_probe change.x pvalue.x affy_probe change.y pvalue.y
## 1: ILMN_1772594 1.26 8.7e-06 PSR06035179.hg.1 1.16 3.4e-05
## 2: ILMN_1772594 1.26 8.7e-06 PSR01070675.hg.1 1.10 7.2e-05
## 3: ILMN_1772594 1.26 8.7e-06 JUC05011543.hg.1 1.09 8.3e-05
## 4: ILMN_1772594 1.26 8.7e-06 PSR08024619.hg.1 1.05 1.5e-04
## 5: ILMN_1772594 1.26 8.7e-06 PSR20002720.hg.1 1.04 1.6e-04
## 6: ILMN_3206475 1.16 3.3e-05 PSR06035179.hg.1 1.16 3.4e-05
## 7: ILMN_3206475 1.16 3.3e-05 PSR01070675.hg.1 1.10 7.2e-05
## 8: ILMN_3206475 1.16 3.3e-05 JUC05011543.hg.1 1.09 8.3e-05
## 9: ILMN_3206475 1.16 3.3e-05 PSR08024619.hg.1 1.05 1.5e-04
## 10: ILMN_3206475 1.16 3.3e-05 PSR20002720.hg.1 1.04 1.6e-04
## 11: ILMN_1689153 1.11 6.4e-05 PSR06035179.hg.1 1.16 3.4e-05
## 12: ILMN_1689153 1.11 6.4e-05 PSR01070675.hg.1 1.10 7.2e-05
## 13: ILMN_1689153 1.11 6.4e-05 JUC05011543.hg.1 1.09 8.3e-05
## 14: ILMN_1689153 1.11 6.4e-05 PSR08024619.hg.1 1.05 1.5e-04
## 15: ILMN_1689153 1.11 6.4e-05 PSR20002720.hg.1 1.04 1.6e-04
## 16: ILMN_3282983 1.10 7.3e-05 PSR06035179.hg.1 1.16 3.4e-05
## 17: ILMN_3282983 1.10 7.3e-05 PSR01070675.hg.1 1.10 7.2e-05
## 18: ILMN_3282983 1.10 7.3e-05 JUC05011543.hg.1 1.09 8.3e-05
## 19: ILMN_3282983 1.10 7.3e-05 PSR08024619.hg.1 1.05 1.5e-04
## 20: ILMN_3282983 1.10 7.3e-05 PSR20002720.hg.1 1.04 1.6e-04
## 21: ILMN_1708533 -1.07 1.1e-04 PSR06035179.hg.1 1.16 3.4e-05
## 22: ILMN_1708533 -1.07 1.1e-04 PSR01070675.hg.1 1.10 7.2e-05
## 23: ILMN_1708533 -1.07 1.1e-04 JUC05011543.hg.1 1.09 8.3e-05
## 24: ILMN_1708533 -1.07 1.1e-04 PSR08024619.hg.1 1.05 1.5e-04
## 25: ILMN_1708533 -1.07 1.1e-04 PSR20002720.hg.1 1.04 1.6e-04
## 26: ILMN_1723249 1.06 1.3e-04 PSR06035179.hg.1 1.16 3.4e-05
## 27: ILMN_1723249 1.06 1.3e-04 PSR01070675.hg.1 1.10 7.2e-05
## 28: ILMN_1723249 1.06 1.3e-04 JUC05011543.hg.1 1.09 8.3e-05
## 29: ILMN_1723249 1.06 1.3e-04 PSR08024619.hg.1 1.05 1.5e-04
## 30: ILMN_1723249 1.06 1.3e-04 PSR20002720.hg.1 1.04 1.6e-04
## 31: ILMN_3206111 1.06 1.4e-04 PSR06035179.hg.1 1.16 3.4e-05
## 32: ILMN_3206111 1.06 1.4e-04 PSR01070675.hg.1 1.10 7.2e-05
## 33: ILMN_3206111 1.06 1.4e-04 JUC05011543.hg.1 1.09 8.3e-05
## 34: ILMN_3206111 1.06 1.4e-04 PSR08024619.hg.1 1.05 1.5e-04
## 35: ILMN_3206111 1.06 1.4e-04 PSR20002720.hg.1 1.04 1.6e-04
## 36: ABCB10 ILMN_1665730 1.05 1.4e-04 JUC01031346.hg.1 -1.11 6.4e-05
## 37: ABCB10 ILMN_1665730 1.05 1.4e-04 JUC01031349.hg.1 -1.08 9.9e-05
## 38: ABCB10 ILMN_1665730 1.05 1.4e-04 JUC01031353.hg.1 1.07 1.1e-04
## 39: ABCB10 ILMN_1665730 1.05 1.4e-04 PSR01059893.hg.1 1.04 1.6e-04
## 40: CIB2 ILMN_1714489 1.19 2.4e-05 PSR15016581.hg.1 -1.05 1.5e-04
## 41: FABP4 ILMN_1773006 -1.09 9.2e-05 PSR08017600.hg.1 1.22 1.6e-05
## 42: FABP4 ILMN_1773006 -1.09 9.2e-05 PSR08017606.hg.1 1.20 2.0e-05
## 43: FOXD4L4 ILMN_3238576 1.10 7.6e-05 PSR09002806.hg.1 -1.15 3.9e-05
## 44: FOXD4L4 ILMN_3238576 1.10 7.6e-05 PSR09002801.hg.1 -1.11 6.4e-05
## 45: FOXD4L4 ILMN_3238576 1.10 7.6e-05 PSR09015126.hg.1 1.07 1.1e-04
## 46: FOXD4L4 ILMN_3238576 1.06 1.2e-04 PSR09002806.hg.1 -1.15 3.9e-05
## 47: FOXD4L4 ILMN_3238576 1.06 1.2e-04 PSR09002801.hg.1 -1.11 6.4e-05
## 48: FOXD4L4 ILMN_3238576 1.06 1.2e-04 PSR09015126.hg.1 1.07 1.1e-04
## 49: GYPB ILMN_1683093 1.15 4.1e-05 PSR04029411.hg.1 1.13 5.1e-05
## 50: GYPB ILMN_1683093 1.15 4.1e-05 PSR04029413.hg.1 1.12 6.1e-05
## 51: GZMB ILMN_2109489 1.10 8.0e-05 PSR14011271.hg.1 1.08 1.0e-04
## 52: ISCU ILMN_2409062 1.11 6.4e-05 JUC12005887.hg.1 -1.09 8.3e-05
## 53: ISCU ILMN_2409062 1.11 6.4e-05 JUC12005889.hg.1 1.05 1.5e-04
## 54: ISCU ILMN_1735432 1.08 1.0e-04 JUC12005887.hg.1 -1.09 8.3e-05
## 55: ISCU ILMN_1735432 1.08 1.0e-04 JUC12005889.hg.1 1.05 1.5e-04
## 56: SEPHS1 ILMN_2176768 -1.16 3.3e-05 PSR10013017.hg.1 -1.18 2.6e-05
## 57: SEPHS1 ILMN_2176768 -1.16 3.3e-05 JUC10007537.hg.1 1.12 6.0e-05
## 58: SEPHS1 ILMN_2176768 -1.16 3.3e-05 JUC10007538.hg.1 -1.11 6.5e-05
## 59: SEPHS1 ILMN_1673369 1.09 8.2e-05 PSR10013017.hg.1 -1.18 2.6e-05
## 60: SEPHS1 ILMN_1673369 1.09 8.2e-05 JUC10007537.hg.1 1.12 6.0e-05
## 61: SEPHS1 ILMN_1673369 1.09 8.2e-05 JUC10007538.hg.1 -1.11 6.5e-05
## 62: TSTA3 ILMN_1697777 1.09 9.1e-05 JUC08011706.hg.1 -1.14 4.2e-05
## 63: ZDHHC2 ILMN_1769783 1.04 1.6e-04 JUC08000597.hg.1 1.12 5.9e-05
## gene ilmn_probe change.x pvalue.x affy_probe change.y pvalue.y
# Keep only the last probe for each gene
# unique(..., fromLast=TRUE) retains the final row for each gene value
heart_3 <- unique(heart_2, by="gene", fromLast=TRUE)
cardio_3 <- unique(cardio_2, by="gene", fromLast=TRUE)
# Inner join
# One row per gene now; suffixes label the per-platform measurement columns
reproducible <- merge(heart_3, cardio_3, by="gene", suffixes=c(".heart", ".cardio"))
reproducible
## gene ilmn_probe change.heart pvalue.heart affy_probe
## 1: ILMN_3206111 1.06 1.4e-04 PSR20002720.hg.1
## 2: ABCB10 ILMN_1665730 1.05 1.4e-04 PSR01059893.hg.1
## 3: CIB2 ILMN_1714489 1.19 2.4e-05 PSR15016581.hg.1
## 4: FABP4 ILMN_1773006 -1.09 9.2e-05 PSR08017606.hg.1
## 5: FOXD4L4 ILMN_3238576 1.06 1.2e-04 PSR09015126.hg.1
## 6: GYPB ILMN_1683093 1.15 4.1e-05 PSR04029413.hg.1
## 7: GZMB ILMN_2109489 1.10 8.0e-05 PSR14011271.hg.1
## 8: ISCU ILMN_1735432 1.08 1.0e-04 JUC12005889.hg.1
## 9: SEPHS1 ILMN_1673369 1.09 8.2e-05 JUC10007538.hg.1
## 10: TSTA3 ILMN_1697777 1.09 9.1e-05 JUC08011706.hg.1
## 11: ZDHHC2 ILMN_1769783 1.04 1.6e-04 JUC08000597.hg.1
## change.cardio pvalue.cardio
## 1: 1.04 1.6e-04
## 2: 1.04 1.6e-04
## 3: -1.05 1.5e-04
## 4: 1.20 2.0e-05
## 5: 1.07 1.1e-04
## 6: 1.12 6.1e-05
## 7: 1.08 1.0e-04
## 8: 1.05 1.5e-04
## 9: -1.11 6.5e-05
## 10: -1.14 4.2e-05
## 11: 1.12 5.9e-05
# Right join taking the first match
# mult="first" keeps only the first matching heart_2 row for each
# framingham row when a gene matches multiple probes
heart_2[framingham, on="gene", mult="first"]
## ilmn_probe gene change pvalue i.change i.pvalue
## 1: <NA> SGIP1 NA NA 1.06 8.1e-06
## 2: ILMN_2109489 GZMB 1.10 8.0e-05 -1.14 1.4e-05
## 3: <NA> SLC7A11 NA NA 1.09 2.8e-05
## 4: ILMN_1725594 FAM188A -1.18 2.8e-05 1.07 3.2e-05
## 5: <NA> CCDC144B/CCDC144A NA NA 1.26 3.8e-05
## 6: <NA> TMTC2 NA NA 1.07 4.7e-05
## 7: ILMN_1712400 SERPINB6 1.17 3.0e-05 -1.06 1.5e-04
## 8: ILMN_2344204 PRR13 1.14 4.6e-05 1.11 1.8e-04
## 9: <NA> TMEM56 NA NA 1.12 2.3e-04
## 10: <NA> C20orf20 NA NA 1.08 2.9e-04
## 11: <NA> GATS NA NA -1.05 3.1e-04
## 12: ILMN_1665730 ABCB10 1.05 1.4e-04 1.09 3.5e-04
## 13: ILMN_2409062 ISCU 1.11 6.4e-05 1.08 3.6e-04
## 14: <NA> PAGE1 NA NA 1.08 4.0e-04
## 15: ILMN_1769783 ZDHHC2 1.04 1.6e-04 1.10 4.0e-04
## 16: ILMN_1714489 CIB2 1.19 2.4e-05 -1.07 4.1e-04
## 17: <NA> CRYGB NA NA -1.08 4.4e-04
## 18: ILMN_1773006 FABP4 -1.09 9.2e-05 1.09 4.7e-04
## 19: <NA> FIS1 NA NA 1.11 5.5e-04
## 20: ILMN_1657680 CCDC69 -1.05 1.5e-04 -1.05 5.5e-04
## 21: ILMN_1683093 GYPB 1.15 4.1e-05 1.12 5.8e-04
## 22: <NA> DLEU2 NA NA 1.09 6.6e-04
## 23: <NA> RBM24 NA NA 1.05 6.9e-04
## 24: <NA> HBM NA NA 1.11 7.6e-04
## 25: ILMN_1697777 TSTA3 1.09 9.1e-05 1.11 7.7e-04
## 26: ILMN_1749930 TMEM48 -1.06 1.3e-04 1.06 8.0e-04
## 27: <NA> GABARAPL2 NA NA 1.08 8.0e-04
## 28: <NA> MAPRE2 NA NA -1.05 8.1e-04
## 29: ILMN_1794306 USP28 -1.10 7.3e-05 -1.08 8.1e-04
## 30: <NA> DAD1 NA NA 1.05 8.4e-04
## 31: ILMN_1793410 SNTB1 1.23 1.2e-05 1.04 8.5e-04
## 32: <NA> STX1B NA NA 1.07 8.6e-04
## 33: <NA> PRPF3 NA NA -1.05 8.8e-04
## 34: <NA> GUK1 NA NA 1.12 9.1e-04
## 35: <NA> CHPT1 NA NA 1.09 9.7e-04
## ilmn_probe gene change pvalue i.change i.pvalue
# Anti-join
# X[!i, on=...] keeps rows of reproducible whose gene is absent from
# framingham -- i.e. findings not replicated in the Framingham study
reproducible[!framingham, on="gene"]
## gene ilmn_probe change.heart pvalue.heart affy_probe
## 1: ILMN_3206111 1.06 1.4e-04 PSR20002720.hg.1
## 2: FOXD4L4 ILMN_3238576 1.06 1.2e-04 PSR09015126.hg.1
## 3: SEPHS1 ILMN_1673369 1.09 8.2e-05 JUC10007538.hg.1
## change.cardio pvalue.cardio
## 1: 1.04 1.6e-04
## 2: 1.07 1.1e-04
## 3: -1.11 6.5e-05
Chapter 4 - Concatenating and Reshaping data.table
Concatenating two or more data.tables:
Set operations:
Melting data.tables:
Casting data.tables:
Example code includes:
# Load weekly ebola case-count tables
ebola_W50 <- fread("./RInputFiles/ebola_2014_W50.csv")
ebola_W51 <- fread("./RInputFiles/ebola_2014_W51.csv")
ebola_W52 <- fread("./RInputFiles/ebola_2014_W52.csv")
# Concatenate case numbers from weeks 50 and 51
# rbind() stacks data.tables with matching column sets row-wise
rbind(ebola_W50, ebola_W51)
## Location period_code period_start period_end Confirmed
## 1: CONAKRY 2014-W50 08 December 2014 14 December 2014 37
## 2: COYAH 2014-W50 08 December 2014 14 December 2014 31
## 3: DABOLA 2014-W50 08 December 2014 14 December 2014 2
## 4: DUBREKA 2014-W50 08 December 2014 14 December 2014 6
## 5: FARANAH 2014-W50 08 December 2014 14 December 2014 0
## 6: FORECARIAH 2014-W50 08 December 2014 14 December 2014 22
## 7: GUECKEDOU 2014-W50 08 December 2014 14 December 2014 3
## 8: KANKAN 2014-W50 08 December 2014 14 December 2014 2
## 9: KEROUANE 2014-W50 08 December 2014 14 December 2014 9
## 10: KINDIA 2014-W50 08 December 2014 14 December 2014 1
## 11: KISSIDOUGOU 2014-W50 08 December 2014 14 December 2014 2
## 12: KOUROUSSA 2014-W50 08 December 2014 14 December 2014 3
## 13: LOLA 2014-W50 08 December 2014 14 December 2014 4
## 14: MACENTA 2014-W50 08 December 2014 14 December 2014 6
## 15: N'ZEREKORE 2014-W50 08 December 2014 14 December 2014 13
## 16: SIGUIRI 2014-W50 08 December 2014 14 December 2014 0
## 17: TELIMELE 2014-W50 08 December 2014 14 December 2014 10
## 18: BEYLA 2014-W51 15 December 2014 21 December 2014 3
## 19: CONAKRY 2014-W51 15 December 2014 21 December 2014 39
## 20: COYAH 2014-W51 15 December 2014 21 December 2014 35
## 21: DABOLA 2014-W51 15 December 2014 21 December 2014 1
## 22: DUBREKA 2014-W51 15 December 2014 21 December 2014 5
## 23: FORECARIAH 2014-W51 15 December 2014 21 December 2014 15
## 24: FRIA 2014-W51 15 December 2014 21 December 2014 1
## 25: GUECKEDOU 2014-W51 15 December 2014 21 December 2014 7
## 26: KANKAN 2014-W51 15 December 2014 21 December 2014 2
## 27: KEROUANE 2014-W51 15 December 2014 21 December 2014 17
## 28: KINDIA 2014-W51 15 December 2014 21 December 2014 21
## 29: KISSIDOUGOU 2014-W51 15 December 2014 21 December 2014 117
## 30: LOLA 2014-W51 15 December 2014 21 December 2014 14
## 31: MACENTA 2014-W51 15 December 2014 21 December 2014 24
## 32: N'ZEREKORE 2014-W51 15 December 2014 21 December 2014 18
## 33: TELIMELE 2014-W51 15 December 2014 21 December 2014 6
## Location period_code period_start period_end Confirmed
## Probable
## 1: 6
## 2: 5
## 3: 0
## 4: 3
## 5: 14
## 6: 1
## 7: 0
## 8: 0
## 9: 0
## 10: 24
## 11: 0
## 12: 0
## 13: 0
## 14: 0
## 15: 0
## 16: 4
## 17: 0
## 18: 0
## 19: 6
## 20: 0
## 21: 0
## 22: 1
## 23: 0
## 24: 0
## 25: 0
## 26: 0
## 27: 0
## 28: 6
## 29: 31
## 30: 1
## 31: 2
## 32: 0
## 33: 0
## Probable
# Intentionally throws an error
# Concatenate case numbers from all three weeks
# (the W52 table's columns do not line up with the others, so plain rbind() fails)
# rbind(ebola_W50, ebola_W51, ebola_W52)
# Modify the code
# fill=TRUE pads columns missing from some inputs with NA -- the W52 rows
# show NA in the Probable column in the output below
rbind(ebola_W50, ebola_W51, ebola_W52, fill=TRUE)
## Location period_code period_start period_end Confirmed
## 1: CONAKRY 2014-W50 08 December 2014 14 December 2014 37
## 2: COYAH 2014-W50 08 December 2014 14 December 2014 31
## 3: DABOLA 2014-W50 08 December 2014 14 December 2014 2
## 4: DUBREKA 2014-W50 08 December 2014 14 December 2014 6
## 5: FARANAH 2014-W50 08 December 2014 14 December 2014 0
## 6: FORECARIAH 2014-W50 08 December 2014 14 December 2014 22
## 7: GUECKEDOU 2014-W50 08 December 2014 14 December 2014 3
## 8: KANKAN 2014-W50 08 December 2014 14 December 2014 2
## 9: KEROUANE 2014-W50 08 December 2014 14 December 2014 9
## 10: KINDIA 2014-W50 08 December 2014 14 December 2014 1
## 11: KISSIDOUGOU 2014-W50 08 December 2014 14 December 2014 2
## 12: KOUROUSSA 2014-W50 08 December 2014 14 December 2014 3
## 13: LOLA 2014-W50 08 December 2014 14 December 2014 4
## 14: MACENTA 2014-W50 08 December 2014 14 December 2014 6
## 15: N'ZEREKORE 2014-W50 08 December 2014 14 December 2014 13
## 16: SIGUIRI 2014-W50 08 December 2014 14 December 2014 0
## 17: TELIMELE 2014-W50 08 December 2014 14 December 2014 10
## 18: BEYLA 2014-W51 15 December 2014 21 December 2014 3
## 19: CONAKRY 2014-W51 15 December 2014 21 December 2014 39
## 20: COYAH 2014-W51 15 December 2014 21 December 2014 35
## 21: DABOLA 2014-W51 15 December 2014 21 December 2014 1
## 22: DUBREKA 2014-W51 15 December 2014 21 December 2014 5
## 23: FORECARIAH 2014-W51 15 December 2014 21 December 2014 15
## 24: FRIA 2014-W51 15 December 2014 21 December 2014 1
## 25: GUECKEDOU 2014-W51 15 December 2014 21 December 2014 7
## 26: KANKAN 2014-W51 15 December 2014 21 December 2014 2
## 27: KEROUANE 2014-W51 15 December 2014 21 December 2014 17
## 28: KINDIA 2014-W51 15 December 2014 21 December 2014 21
## 29: KISSIDOUGOU 2014-W51 15 December 2014 21 December 2014 117
## 30: LOLA 2014-W51 15 December 2014 21 December 2014 14
## 31: MACENTA 2014-W51 15 December 2014 21 December 2014 24
## 32: N'ZEREKORE 2014-W51 15 December 2014 21 December 2014 18
## 33: TELIMELE 2014-W51 15 December 2014 21 December 2014 6
## 34: BEYLA 2014-W52 22 December 2014 28 December 2014 1
## 35: CONAKRY 2014-W52 22 December 2014 28 December 2014 46
## 36: COYAH 2014-W52 22 December 2014 28 December 2014 23
## 37: DABOLA 2014-W52 22 December 2014 28 December 2014 2
## 38: DUBREKA 2014-W52 22 December 2014 28 December 2014 36
## 39: FARANAH 2014-W52 22 December 2014 28 December 2014 4
## 40: FORECARIAH 2014-W52 22 December 2014 28 December 2014 2
## 41: GUECKEDOU 2014-W52 22 December 2014 28 December 2014 1
## 42: KANKAN 2014-W52 22 December 2014 28 December 2014 3
## 43: KEROUANE 2014-W52 22 December 2014 28 December 2014 4
## 44: KINDIA 2014-W52 22 December 2014 28 December 2014 31
## 45: KISSIDOUGOU 2014-W52 22 December 2014 28 December 2014 26
## 46: KOUROUSSA 2014-W52 22 December 2014 28 December 2014 1
## 47: LOLA 2014-W52 22 December 2014 28 December 2014 17
## 48: MACENTA 2014-W52 22 December 2014 28 December 2014 4
## 49: N'ZEREKORE 2014-W52 22 December 2014 28 December 2014 5
## 50: SIGUIRI 2014-W52 22 December 2014 28 December 2014 2
## 51: TELIMELE 2014-W52 22 December 2014 28 December 2014 10
## Location period_code period_start period_end Confirmed
## Probable
## 1: 6
## 2: 5
## 3: 0
## 4: 3
## 5: 14
## 6: 1
## 7: 0
## 8: 0
## 9: 0
## 10: 24
## 11: 0
## 12: 0
## 13: 0
## 14: 0
## 15: 0
## 16: 4
## 17: 0
## 18: 0
## 19: 6
## 20: 0
## 21: 0
## 22: 1
## 23: 0
## 24: 0
## 25: 0
## 26: 0
## 27: 0
## 28: 6
## 29: 31
## 30: 1
## 31: 2
## 32: 0
## 33: 0
## 34: NA
## 35: NA
## 36: NA
## 37: NA
## 38: NA
## 39: NA
## 40: NA
## 41: NA
## 42: NA
## 43: NA
## 44: NA
## 45: NA
## 46: NA
## 47: NA
## 48: NA
## 49: NA
## 50: NA
## 51: NA
## Probable
# Load one GDP table per continent
gdp_africa <- fread("./RInputFiles/gdp_africa_2000.csv")
gdp_asia <- fread("./RInputFiles/gdp_asia_2000.csv")
gdp_europe <- fread("./RInputFiles/gdp_europe_2000.csv")
gdp_north_america <- fread("./RInputFiles/gdp_north_america_2000.csv")
gdp_oceania <- fread("./RInputFiles/gdp_oceania_2000.csv")
gdp_south_america <- fread("./RInputFiles/gdp_south_america_2000.csv")
# Collect the tables in a named list so rbindlist() can stack them and
# (optionally) record each row's origin via the list names
gdp <- list(africa=gdp_africa, asia=gdp_asia, europe=gdp_europe,
north_america=gdp_north_america, oceania=gdp_oceania, south_america=gdp_south_america
)
# Concatenate its data.tables
gdp_all_1 <- rbindlist(gdp)
# Concatenate its data.tables
# idcol="continent" adds a column holding each row's source list name
gdp_all_2 <- rbindlist(gdp, idcol="continent")
str(gdp_all_2)
## Classes 'data.table' and 'data.frame': 207 obs. of 6 variables:
## $ continent : chr "africa" "africa" "africa" "africa" ...
## $ country : chr "Algeria" "Angola" "Benin" "Botswana" ...
## $ year : int 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 ...
## $ gdp_per_capita: num 1794 298 346 3204 212 ...
## $ population : int 31183658 15058638 6949366 1736579 11607944 6767073 15927713 438737 3726048 8343321 ...
## $ total_gdp : num 5.60e+10 4.49e+09 2.40e+09 5.56e+09 2.47e+09 ...
## - attr(*, ".internal.selfref")=<externalptr>
# Inspect rows spanning the asia/europe boundary; the "Fix the problem"
# step further below addresses a column-matching issue with use.names=TRUE
gdp_all_2[95:105]
## continent country year gdp_per_capita population total_gdp
## 1: asia Turkmenistan 2000 645.2771 4501419 2904662605
## 2: asia United Arab Emirates 2000 34395.1491 3050128 104909607426
## 3: asia Uzbekistan 2000 558.2211 24518222 13686589934
## 4: asia Vietnam 2000 401.5478 80285563 32238491363
## 5: asia West Bank and Gaza 2000 1369.1930 3223781 4413978466
## 6: asia Yemen 2000 543.7139 17795219 9675507618
## 7: europe Albania 2000 1200.1374 3121965 3746787074
## 8: europe Andorra 2000 17539.4420 65399 1147061967
## 9: europe Armenia 2000 621.4248 3076098 1911563665
## 10: europe Austria 2000 23974.1831 8050884 193013366891
## 11: europe Azerbaijan 2000 655.0974 8117742 5317911943
# Fix the problem
# use.names=TRUE makes rbindlist() match columns by name rather than by
# position when stacking the list's tables
gdp_all_3 <- rbindlist(gdp, idcol = "continent", use.names=TRUE)
gdp_all_3
## continent country year gdp_per_capita population total_gdp
## 1: africa Algeria 2000 1794.4052 31183658 55956119099
## 2: africa Angola 2000 298.4058 15058638 4493585565
## 3: africa Benin 2000 345.9504 6949366 2404135636
## 4: africa Botswana 2000 3204.1155 1736579 5564199661
## 5: africa Burkina Faso 2000 212.3754 11607944 2465241592
## ---
## 203: south_america Paraguay 2000 1323.4819 5302703 7018031622
## 204: south_america Peru 2000 2060.5763 25914875 53399576128
## 205: south_america Suriname 2000 1911.0463 480751 918737427
## 206: south_america Uruguay 2000 6914.3626 3321242 22964271411
## 207: south_america Venezuela 2000 4818.7082 24481477 117969093478
# Obtain countries in both Asia and Europe
# fintersect() returns the rows present in both tables (set intersection)
fintersect(gdp$europe, gdp$asia)
## country year gdp_per_capita population total_gdp
## 1: Armenia 2000 621.4248 3076098 1911563665
## 2: Azerbaijan 2000 655.0974 8117742 5317911943
## 3: Cyprus 2000 13421.6554 943287 12660473076
## 4: Georgia 2000 691.9977 4743591 3282554086
## 5: Kazakhstan 2000 1229.0010 14956769 18381883430
## 6: Russia 2000 1775.1413 146400951 259882373162
## 7: Turkey 2000 4189.4781 63240157 264943250389
# Concatenate all data tables
gdp_all <- rbindlist(gdp, use.names=TRUE)
# Find all countries that span multiple continents
# duplicated() flags the second and later occurrences of identical rows
gdp_all[duplicated(gdp_all)]
## country year gdp_per_capita population total_gdp
## 1: Armenia 2000 621.4248 3076098 1911563665
## 2: Azerbaijan 2000 655.0974 8117742 5317911943
## 3: Cyprus 2000 13421.6554 943287 12660473076
## 4: Georgia 2000 691.9977 4743591 3282554086
## 5: Kazakhstan 2000 1229.0010 14956769 18381883430
## 6: Russia 2000 1775.1413 146400951 259882373162
## 7: Turkey 2000 4189.4781 63240157 264943250389
# Get all countries in either Asia or Europe
# funion() returns the set union of the two tables' rows, dropping duplicates
funion(gdp$europe, gdp$asia)
## country year gdp_per_capita population total_gdp
## 1: Albania 2000 1200.1374 3121965 3.746787e+09
## 2: Andorra 2000 17539.4420 65399 1.147062e+09
## 3: Armenia 2000 621.4248 3076098 1.911564e+09
## 4: Austria 2000 23974.1831 8050884 1.930134e+11
## 5: Azerbaijan 2000 655.0974 8117742 5.317912e+09
## 6: Belarus 2000 1273.0491 9952055 1.266945e+10
## 7: Belgium 2000 22697.0123 10268380 2.330615e+11
## 8: Bosnia and Herzegovina 2000 1490.6428 3792878 5.653826e+09
## 9: Bulgaria 2000 1579.3482 8000510 1.263559e+10
## 10: Channel Islands 2000 44310.9023 148725 6.590139e+09
## 11: Croatia 2000 4861.6775 4428069 2.152784e+10
## 12: Cyprus 2000 13421.6554 943287 1.266047e+10
## 13: Czech Republic 2000 5724.8375 10263010 5.875406e+10
## 14: Denmark 2000 29980.1555 5338283 1.600426e+11
## 15: Estonia 2000 4143.9272 1399145 5.797955e+09
## 16: Faeroe Islands 2000 23224.0986 46491 1.079712e+09
## 17: Finland 2000 23529.5385 5176482 1.218002e+11
## 18: France 2000 21774.9930 59387183 1.293155e+12
## 19: Georgia 2000 691.9977 4743591 3.282554e+09
## 20: Germany 2000 22945.7088 81895925 1.879160e+12
## 21: Greece 2000 11396.2326 10954032 1.248347e+11
## 22: Hungary 2000 4542.7207 10224113 4.644529e+10
## 23: Iceland 2000 30928.6756 281214 8.697577e+09
## 24: Ireland 2000 25629.6501 3841574 9.845820e+10
## 25: Isle of Man 2000 20359.4625 76806 1.563729e+09
## 26: Italy 2000 19388.2788 57147081 1.107984e+12
## 27: Kazakhstan 2000 1229.0010 14956769 1.838188e+10
## 28: Kosovo 2000 1087.7624 2608347 2.837262e+09
## 29: Latvia 2000 3300.9347 2371481 7.828104e+09
## 30: Liechtenstein 2000 75606.2032 33282 2.516326e+09
## 31: Lithuania 2000 3267.3474 3486373 1.139119e+10
## 32: Luxembourg 2000 46453.2458 436107 2.025859e+10
## 33: Macedonia, FYR 2000 1785.3268 2012051 3.592169e+09
## 34: Malta 2000 10377.0373 387180 4.017781e+09
## 35: Moldova 2000 354.0017 4201088 1.487192e+09
## 36: Monaco 2000 75382.4466 32081 2.418344e+09
## 37: Montenegro 2000 1555.9125 613557 9.546410e+08
## 38: Netherlands 2000 24179.7314 15894016 3.843130e+11
## 39: Norway 2000 37472.6717 4491572 1.683112e+11
## 40: Poland 2000 4454.0802 38486305 1.714211e+11
## 41: Portugal 2000 11470.8979 10278542 1.179041e+11
## 42: Romania 2000 1650.9684 22128139 3.653286e+10
## 43: Russia 2000 1775.1413 146400951 2.598824e+11
## 44: San Marino 2000 28698.3217 27420 7.869080e+08
## 45: Serbia 2000 809.2751 9463306 7.658418e+09
## 46: Slovak Republic 2000 5330.4016 5386065 2.870989e+10
## 47: Slovenia 2000 10045.3601 1988652 1.997673e+10
## 48: Spain 2000 14413.7889 40749800 5.873590e+11
## 49: Sweden 2000 27869.3776 8872284 2.472650e+11
## 50: Switzerland 2000 35639.4789 7165581 2.553776e+11
## 51: Turkey 2000 4189.4781 63240157 2.649433e+11
## 52: Ukraine 2000 635.7090 48746269 3.098844e+10
## 53: United Kingdom 2000 25057.6135 58867004 1.475067e+12
## 54: Bahrain 2000 12489.4677 666855 8.328664e+09
## 55: Bangladesh 2000 363.6399 131280739 4.773892e+10
## 56: Bhutan 2000 768.7510 564187 4.337193e+08
## 57: Brunei 2000 18350.1306 330554 6.065709e+09
## 58: Cambodia 2000 293.5685 12197905 3.580920e+09
## 59: China 2000 949.1781 1269974572 1.205432e+12
## 60: Hong Kong, China 2000 25756.6638 6783502 1.747204e+11
## 61: India 2000 450.4151 1053481072 4.745038e+11
## 62: Indonesia 2000 773.3110 211540428 1.635865e+11
## 63: Iran 2000 1550.0906 65850062 1.020736e+11
## 64: Iraq 2000 1063.4815 23574751 2.507131e+10
## 65: Israel 2000 19859.3021 6013711 1.194281e+11
## 66: Japan 2000 37291.7062 125714674 4.688115e+12
## 67: Jordan 2000 1764.2299 4767476 8.410924e+09
## 68: Kuwait 2000 19434.4000 1929470 3.749809e+10
## 69: Kyrgyz Republic 2000 279.6203 4954850 1.385477e+09
## 70: Lao 2000 325.5931 5342879 1.739605e+09
## 71: Lebanon 2000 4612.1987 3235380 1.492222e+10
## 72: Macao, China 2000 14128.8752 431907 6.102360e+09
## 73: Malaysia 2000 4005.5563 23420751 9.381314e+10
## 74: Maldives 2000 2284.9740 280384 6.406701e+08
## 75: Mongolia 2000 471.4733 2397438 1.130328e+09
## 76: Nepal 2000 225.1687 23740145 5.345537e+09
## 77: Oman 2000 8774.9338 2239403 1.965061e+10
## 78: Pakistan 2000 511.7026 138250487 7.074313e+10
## 79: Philippines 2000 1048.0705 77932247 8.167849e+10
## 80: Qatar 2000 30052.7612 593453 1.783490e+10
## 81: Saudi Arabia 2000 9400.8117 21392273 2.011047e+11
## 82: Singapore 2000 23814.5566 3918183 9.330979e+10
## 83: South Korea 2000 11346.6650 46206271 5.242871e+11
## 84: Sri Lanka 2000 854.9267 18783745 1.605873e+10
## 85: Syria 2000 1208.7346 16354050 1.976771e+10
## 86: Tajikistan 2000 139.4099 6186152 8.624107e+08
## 87: Thailand 2000 1943.2379 62693322 1.218280e+11
## 88: Timor-Leste 2000 380.9230 847185 3.227123e+08
## 89: Turkmenistan 2000 645.2771 4501419 2.904663e+09
## 90: United Arab Emirates 2000 34395.1491 3050128 1.049096e+11
## 91: Uzbekistan 2000 558.2211 24518222 1.368659e+10
## 92: Vietnam 2000 401.5478 80285563 3.223849e+10
## 93: West Bank and Gaza 2000 1369.1930 3223781 4.413978e+09
## 94: Yemen 2000 543.7139 17795219 9.675508e+09
## country year gdp_per_capita population total_gdp
# Stack every regional table in the `gdp` list into a single data.table,
# matching columns by name rather than by position.
gdp_all <- rbindlist(gdp, use.names = TRUE)
# Show the de-duplicated rows (one row per country/year combination)
unique(gdp_all)
## country year gdp_per_capita population total_gdp
## 1: Algeria 2000 1794.4052 31183658 55956119099
## 2: Angola 2000 298.4058 15058638 4493585565
## 3: Benin 2000 345.9504 6949366 2404135636
## 4: Botswana 2000 3204.1155 1736579 5564199661
## 5: Burkina Faso 2000 212.3754 11607944 2465241592
## ---
## 196: Paraguay 2000 1323.4819 5302703 7018031622
## 197: Peru 2000 2060.5763 25914875 53399576128
## 198: Suriname 2000 1911.0463 480751 918737427
## 199: Uruguay 2000 6914.3626 3321242 22964271411
## 200: Venezuela 2000 4818.7082 24481477 117969093478
# Load the Middle East GDP figures for the year 2000
gdp_middle_east <- fread("./RInputFiles/gdp_middle_east_2000.csv")
# African countries that do not also appear in the Middle East table
fsetdiff(x = gdp$africa, y = gdp_middle_east)
## country year gdp_per_capita population total_gdp
## 1: Algeria 2000 1794.40523 31183658 55956119099
## 2: Angola 2000 298.40584 15058638 4493585565
## 3: Benin 2000 345.95036 6949366 2404135636
## 4: Botswana 2000 3204.11548 1736579 5564199661
## 5: Burkina Faso 2000 212.37539 11607944 2465241592
## 6: Burundi 2000 131.04633 6767073 886800109
## 7: Cameroon 2000 592.37200 15927713 9435131210
## 8: Cape Verde 2000 1233.25752 438737 541075706
## 9: Central African Republic 2000 247.02818 3726048 920438872
## 10: Chad 2000 168.45000 8343321 1405432403
## 11: Comoros 2000 358.95291 547696 196597072
## 12: Congo, Dem. Rep. 2000 86.75451 48048664 4168438144
## 13: Congo, Rep. 2000 1026.83155 3109269 3192695522
## 14: Cote d'Ivoire 2000 628.22811 16517948 10377039332
## 15: Djibouti 2000 753.11965 722562 544175637
## 16: Equatorial Guinea 2000 2388.34863 530896 1267964732
## 17: Eritrea 2000 172.75716 3535156 610723497
## 18: Ethiopia 2000 123.68092 66443603 8217805735
## 19: Gabon 2000 4102.62460 1231548 5052579123
## 20: Gambia 2000 603.59535 1228863 741735988
## 21: Ghana 2000 259.99069 18824994 4894323262
## 22: Guinea 2000 358.96291 8799165 3158573866
## 23: Guinea-Bissau 2000 173.66269 1315455 228445460
## 24: Kenya 2000 406.52306 31065820 12628972195
## 25: Lesotho 2000 379.77562 1856225 704949005
## 26: Liberia 2000 185.81340 2891968 537366418
## 27: Libya 2000 6479.71252 5337264 34583936374
## 28: Madagascar 2000 252.38252 15744811 3973715025
## 29: Malawi 2000 155.27154 11193230 1737990110
## 30: Mali 2000 214.46659 11046926 2369196569
## 31: Mauritania 2000 489.51164 2711421 1327272154
## 32: Mauritius 2000 3861.03854 1185143 4575882801
## 33: Morocco 2000 1271.81109 28950553 36819634425
## 34: Mozambique 2000 233.43927 18264536 4263659870
## 35: Namibia 2000 2061.62097 1897953 3912859707
## 36: Niger 2000 164.64895 11224523 1848105899
## 37: Nigeria 2000 371.76808 122876723 45681643436
## 38: Rwanda 2000 214.23370 8021875 1718555953
## 39: Sao Tome and Principe 2000 543.99904 137164 74617084
## 40: Senegal 2000 492.28623 9860578 4854226722
## 41: Seychelles 2000 7578.85105 81154 615054078
## 42: Sierra Leone 2000 153.47797 4060709 623229365
## 43: South Africa 2000 3019.94637 44896856 135586097167
## 44: Sudan 2000 358.52920 28079664 10067379498
## 45: Swaziland 2000 1508.18366 1063715 1604277582
## 46: Tanzania 2000 307.98615 33991590 10468939093
## 47: Togo 2000 270.00088 4874735 1316182721
## 48: Tunisia 2000 2245.33506 9699192 21777935842
## 49: Uganda 2000 255.78061 23757636 6076742657
## 50: Zambia 2000 318.92682 10585220 3375910577
## 51: Zimbabwe 2000 534.79115 12499981 6684879233
## country year gdp_per_capita population total_gdp
# Asian countries that do not also appear in the Middle East table
fsetdiff(x = gdp$asia, y = gdp_middle_east)
## country year gdp_per_capita population total_gdp
## 1: Armenia 2000 621.4248 3076098 1.911564e+09
## 2: Azerbaijan 2000 655.0974 8117742 5.317912e+09
## 3: Bangladesh 2000 363.6399 131280739 4.773892e+10
## 4: Bhutan 2000 768.7510 564187 4.337193e+08
## 5: Brunei 2000 18350.1306 330554 6.065709e+09
## 6: Cambodia 2000 293.5685 12197905 3.580920e+09
## 7: China 2000 949.1781 1269974572 1.205432e+12
## 8: Georgia 2000 691.9977 4743591 3.282554e+09
## 9: Hong Kong, China 2000 25756.6638 6783502 1.747204e+11
## 10: India 2000 450.4151 1053481072 4.745038e+11
## 11: Indonesia 2000 773.3110 211540428 1.635865e+11
## 12: Japan 2000 37291.7062 125714674 4.688115e+12
## 13: Kazakhstan 2000 1229.0010 14956769 1.838188e+10
## 14: Kyrgyz Republic 2000 279.6203 4954850 1.385477e+09
## 15: Lao 2000 325.5931 5342879 1.739605e+09
## 16: Macao, China 2000 14128.8752 431907 6.102360e+09
## 17: Malaysia 2000 4005.5563 23420751 9.381314e+10
## 18: Maldives 2000 2284.9740 280384 6.406701e+08
## 19: Mongolia 2000 471.4733 2397438 1.130328e+09
## 20: Nepal 2000 225.1687 23740145 5.345537e+09
## 21: Pakistan 2000 511.7026 138250487 7.074313e+10
## 22: Philippines 2000 1048.0705 77932247 8.167849e+10
## 23: Russia 2000 1775.1413 146400951 2.598824e+11
## 24: Singapore 2000 23814.5566 3918183 9.330979e+10
## 25: South Korea 2000 11346.6650 46206271 5.242871e+11
## 26: Sri Lanka 2000 854.9267 18783745 1.605873e+10
## 27: Tajikistan 2000 139.4099 6186152 8.624107e+08
## 28: Thailand 2000 1943.2379 62693322 1.218280e+11
## 29: Timor-Leste 2000 380.9230 847185 3.227123e+08
## 30: Turkmenistan 2000 645.2771 4501419 2.904663e+09
## 31: Uzbekistan 2000 558.2211 24518222 1.368659e+10
## 32: Vietnam 2000 401.5478 80285563 3.223849e+10
## country year gdp_per_capita population total_gdp
# European countries that do not also appear in the Middle East table
fsetdiff(x = gdp$europe, y = gdp_middle_east)
## country year gdp_per_capita population total_gdp
## 1: Albania 2000 1200.1374 3121965 3.746787e+09
## 2: Andorra 2000 17539.4420 65399 1.147062e+09
## 3: Armenia 2000 621.4248 3076098 1.911564e+09
## 4: Austria 2000 23974.1831 8050884 1.930134e+11
## 5: Azerbaijan 2000 655.0974 8117742 5.317912e+09
## 6: Belarus 2000 1273.0491 9952055 1.266945e+10
## 7: Belgium 2000 22697.0123 10268380 2.330615e+11
## 8: Bosnia and Herzegovina 2000 1490.6428 3792878 5.653826e+09
## 9: Bulgaria 2000 1579.3482 8000510 1.263559e+10
## 10: Channel Islands 2000 44310.9023 148725 6.590139e+09
## 11: Croatia 2000 4861.6775 4428069 2.152784e+10
## 12: Czech Republic 2000 5724.8375 10263010 5.875406e+10
## 13: Denmark 2000 29980.1555 5338283 1.600426e+11
## 14: Estonia 2000 4143.9272 1399145 5.797955e+09
## 15: Faeroe Islands 2000 23224.0986 46491 1.079712e+09
## 16: Finland 2000 23529.5385 5176482 1.218002e+11
## 17: France 2000 21774.9930 59387183 1.293155e+12
## 18: Georgia 2000 691.9977 4743591 3.282554e+09
## 19: Germany 2000 22945.7088 81895925 1.879160e+12
## 20: Greece 2000 11396.2326 10954032 1.248347e+11
## 21: Hungary 2000 4542.7207 10224113 4.644529e+10
## 22: Iceland 2000 30928.6756 281214 8.697577e+09
## 23: Ireland 2000 25629.6501 3841574 9.845820e+10
## 24: Isle of Man 2000 20359.4625 76806 1.563729e+09
## 25: Italy 2000 19388.2788 57147081 1.107984e+12
## 26: Kazakhstan 2000 1229.0010 14956769 1.838188e+10
## 27: Kosovo 2000 1087.7624 2608347 2.837262e+09
## 28: Latvia 2000 3300.9347 2371481 7.828104e+09
## 29: Liechtenstein 2000 75606.2032 33282 2.516326e+09
## 30: Lithuania 2000 3267.3474 3486373 1.139119e+10
## 31: Luxembourg 2000 46453.2458 436107 2.025859e+10
## 32: Macedonia, FYR 2000 1785.3268 2012051 3.592169e+09
## 33: Malta 2000 10377.0373 387180 4.017781e+09
## 34: Moldova 2000 354.0017 4201088 1.487192e+09
## 35: Monaco 2000 75382.4466 32081 2.418344e+09
## 36: Montenegro 2000 1555.9125 613557 9.546410e+08
## 37: Netherlands 2000 24179.7314 15894016 3.843130e+11
## 38: Norway 2000 37472.6717 4491572 1.683112e+11
## 39: Poland 2000 4454.0802 38486305 1.714211e+11
## 40: Portugal 2000 11470.8979 10278542 1.179041e+11
## 41: Romania 2000 1650.9684 22128139 3.653286e+10
## 42: Russia 2000 1775.1413 146400951 2.598824e+11
## 43: San Marino 2000 28698.3217 27420 7.869080e+08
## 44: Serbia 2000 809.2751 9463306 7.658418e+09
## 45: Slovak Republic 2000 5330.4016 5386065 2.870989e+10
## 46: Slovenia 2000 10045.3601 1988652 1.997673e+10
## 47: Spain 2000 14413.7889 40749800 5.873590e+11
## 48: Sweden 2000 27869.3776 8872284 2.472650e+11
## 49: Switzerland 2000 35639.4789 7165581 2.553776e+11
## 50: Ukraine 2000 635.7090 48746269 3.098844e+10
## 51: United Kingdom 2000 25057.6135 58867004 1.475067e+12
## country year gdp_per_capita population total_gdp
# The Oceania file arrives transposed: variables are stored as rows and
# years as columns, so flip it into the conventional orientation.
gdp_per_capita_wrong <- fread("./RInputFiles/gdp_per_capita_oceania.csv")
# The first column (V1) holds the future column names; its first entry
# is replaced with a label for the year column.
new_names <- gdp_per_capita_wrong$V1
new_names[1] <- "year"
# Transpose everything except the name column, rebuild a data.table,
# and attach the corrected column names.
gdp_per_capita <- as.data.table(t(gdp_per_capita_wrong[, -1]))
setnames(gdp_per_capita, new_names)
# Inspect the reshaped table
gdp_per_capita
## year Australia Fiji French Polynesia Kiribati Marshall Islands
## 1: 1990 17553.38 1833.184 14003.26 670.2639 2137.690
## 2: 1995 18690.44 1955.294 13608.05 661.5434 2581.130
## 3: 2000 21708.04 2074.747 14507.54 812.2821 2127.485
## 4: 2005 23929.16 2308.158 NA 785.3737 2368.149
## 5: 2010 25190.84 2218.147 NA 713.5625 2437.282
## Micronesia, Fed. Sts. New Caledonia New Zealand Palau Papua New Guinea
## 1: 1894.105 13562.65 11627.11 NA 566.8707
## 2: 2189.552 13601.46 12291.84 6355.627 754.7247
## 3: 2177.589 12579.60 13375.78 6251.982 654.6688
## 4: 2196.247 NA 15171.59 6429.586 626.3082
## 5: 2134.037 NA 14629.22 5756.811 744.2105
## Samoa Solomon Islands Tonga Tuvalu Vanuatu
## 1: 1191.783 1101.2480 1493.743 1128.078 1327.357
## 2: 1204.353 1403.8032 1785.211 1278.696 1358.466
## 3: 1391.214 1064.5147 1926.005 1458.950 1469.762
## 4: 1742.154 975.6054 2115.027 1426.191 1355.822
## 5: 1769.565 1143.7875 2069.226 1559.984 1522.384
# Stack the per-country columns into long (tidy) format, keeping `year`
# as the identifier column; the defaults name the new columns
# `variable` and `value`.
melt(gdp_per_capita, id.vars = "year")
## year variable value
## 1: 1990 Australia 17553.3768
## 2: 1995 Australia 18690.4366
## 3: 2000 Australia 21708.0373
## 4: 2005 Australia 23929.1644
## 5: 2010 Australia 25190.8399
## 6: 1990 Fiji 1833.1844
## 7: 1995 Fiji 1955.2939
## 8: 2000 Fiji 2074.7473
## 9: 2005 Fiji 2308.1583
## 10: 2010 Fiji 2218.1470
## 11: 1990 French Polynesia 14003.2649
## 12: 1995 French Polynesia 13608.0520
## 13: 2000 French Polynesia 14507.5415
## 14: 2005 French Polynesia NA
## 15: 2010 French Polynesia NA
## 16: 1990 Kiribati 670.2639
## 17: 1995 Kiribati 661.5434
## 18: 2000 Kiribati 812.2821
## 19: 2005 Kiribati 785.3737
## 20: 2010 Kiribati 713.5625
## 21: 1990 Marshall Islands 2137.6897
## 22: 1995 Marshall Islands 2581.1304
## 23: 2000 Marshall Islands 2127.4855
## 24: 2005 Marshall Islands 2368.1488
## 25: 2010 Marshall Islands 2437.2824
## 26: 1990 Micronesia, Fed. Sts. 1894.1055
## 27: 1995 Micronesia, Fed. Sts. 2189.5520
## 28: 2000 Micronesia, Fed. Sts. 2177.5891
## 29: 2005 Micronesia, Fed. Sts. 2196.2472
## 30: 2010 Micronesia, Fed. Sts. 2134.0372
## 31: 1990 New Caledonia 13562.6526
## 32: 1995 New Caledonia 13601.4555
## 33: 2000 New Caledonia 12579.5951
## 34: 2005 New Caledonia NA
## 35: 2010 New Caledonia NA
## 36: 1990 New Zealand 11627.1065
## 37: 1995 New Zealand 12291.8376
## 38: 2000 New Zealand 13375.7805
## 39: 2005 New Zealand 15171.5943
## 40: 2010 New Zealand 14629.2181
## 41: 1990 Palau NA
## 42: 1995 Palau 6355.6270
## 43: 2000 Palau 6251.9821
## 44: 2005 Palau 6429.5859
## 45: 2010 Palau 5756.8105
## 46: 1990 Papua New Guinea 566.8707
## 47: 1995 Papua New Guinea 754.7247
## 48: 2000 Papua New Guinea 654.6688
## 49: 2005 Papua New Guinea 626.3082
## 50: 2010 Papua New Guinea 744.2105
## 51: 1990 Samoa 1191.7833
## 52: 1995 Samoa 1204.3528
## 53: 2000 Samoa 1391.2144
## 54: 2005 Samoa 1742.1545
## 55: 2010 Samoa 1769.5650
## 56: 1990 Solomon Islands 1101.2480
## 57: 1995 Solomon Islands 1403.8032
## 58: 2000 Solomon Islands 1064.5147
## 59: 2005 Solomon Islands 975.6054
## 60: 2010 Solomon Islands 1143.7875
## 61: 1990 Tonga 1493.7433
## 62: 1995 Tonga 1785.2106
## 63: 2000 Tonga 1926.0046
## 64: 2005 Tonga 2115.0271
## 65: 2010 Tonga 2069.2262
## 66: 1990 Tuvalu 1128.0776
## 67: 1995 Tuvalu 1278.6963
## 68: 2000 Tuvalu 1458.9496
## 69: 2005 Tuvalu 1426.1913
## 70: 2010 Tuvalu 1559.9837
## 71: 1990 Vanuatu 1327.3566
## 72: 1995 Vanuatu 1358.4658
## 73: 2000 Vanuatu 1469.7618
## 74: 2005 Vanuatu 1355.8217
## 75: 2010 Vanuatu 1522.3840
## year variable value
# Same reshape, but give the stacked columns informative names
melt(gdp_per_capita, id.vars = "year",
     variable.name = "country", value.name = "gdp_pc")
## year country gdp_pc
## 1: 1990 Australia 17553.3768
## 2: 1995 Australia 18690.4366
## 3: 2000 Australia 21708.0373
## 4: 2005 Australia 23929.1644
## 5: 2010 Australia 25190.8399
## 6: 1990 Fiji 1833.1844
## 7: 1995 Fiji 1955.2939
## 8: 2000 Fiji 2074.7473
## 9: 2005 Fiji 2308.1583
## 10: 2010 Fiji 2218.1470
## 11: 1990 French Polynesia 14003.2649
## 12: 1995 French Polynesia 13608.0520
## 13: 2000 French Polynesia 14507.5415
## 14: 2005 French Polynesia NA
## 15: 2010 French Polynesia NA
## 16: 1990 Kiribati 670.2639
## 17: 1995 Kiribati 661.5434
## 18: 2000 Kiribati 812.2821
## 19: 2005 Kiribati 785.3737
## 20: 2010 Kiribati 713.5625
## 21: 1990 Marshall Islands 2137.6897
## 22: 1995 Marshall Islands 2581.1304
## 23: 2000 Marshall Islands 2127.4855
## 24: 2005 Marshall Islands 2368.1488
## 25: 2010 Marshall Islands 2437.2824
## 26: 1990 Micronesia, Fed. Sts. 1894.1055
## 27: 1995 Micronesia, Fed. Sts. 2189.5520
## 28: 2000 Micronesia, Fed. Sts. 2177.5891
## 29: 2005 Micronesia, Fed. Sts. 2196.2472
## 30: 2010 Micronesia, Fed. Sts. 2134.0372
## 31: 1990 New Caledonia 13562.6526
## 32: 1995 New Caledonia 13601.4555
## 33: 2000 New Caledonia 12579.5951
## 34: 2005 New Caledonia NA
## 35: 2010 New Caledonia NA
## 36: 1990 New Zealand 11627.1065
## 37: 1995 New Zealand 12291.8376
## 38: 2000 New Zealand 13375.7805
## 39: 2005 New Zealand 15171.5943
## 40: 2010 New Zealand 14629.2181
## 41: 1990 Palau NA
## 42: 1995 Palau 6355.6270
## 43: 2000 Palau 6251.9821
## 44: 2005 Palau 6429.5859
## 45: 2010 Palau 5756.8105
## 46: 1990 Papua New Guinea 566.8707
## 47: 1995 Papua New Guinea 754.7247
## 48: 2000 Papua New Guinea 654.6688
## 49: 2005 Papua New Guinea 626.3082
## 50: 2010 Papua New Guinea 744.2105
## 51: 1990 Samoa 1191.7833
## 52: 1995 Samoa 1204.3528
## 53: 2000 Samoa 1391.2144
## 54: 2005 Samoa 1742.1545
## 55: 2010 Samoa 1769.5650
## 56: 1990 Solomon Islands 1101.2480
## 57: 1995 Solomon Islands 1403.8032
## 58: 2000 Solomon Islands 1064.5147
## 59: 2005 Solomon Islands 975.6054
## 60: 2010 Solomon Islands 1143.7875
## 61: 1990 Tonga 1493.7433
## 62: 1995 Tonga 1785.2106
## 63: 2000 Tonga 1926.0046
## 64: 2005 Tonga 2115.0271
## 65: 2010 Tonga 2069.2262
## 66: 1990 Tuvalu 1128.0776
## 67: 1995 Tuvalu 1278.6963
## 68: 2000 Tuvalu 1458.9496
## 69: 2005 Tuvalu 1426.1913
## 70: 2010 Tuvalu 1559.9837
## 71: 1990 Vanuatu 1327.3566
## 72: 1995 Vanuatu 1358.4658
## 73: 2000 Vanuatu 1469.7618
## 74: 2005 Vanuatu 1355.8217
## 75: 2010 Vanuatu 1522.3840
## year country gdp_pc
# Build a wide table with one confirmed-case column per reporting week:
# rows from week 50 carry their count in Week_50 (NA in Week_51), and
# vice versa for week 51.
ebola_wide <- rbind(ebola_W50, ebola_W51) %>%
  mutate(
    Week_50 = ifelse(period_code == "2014-W50", Confirmed, NA),
    Week_51 = ifelse(period_code == "2014-W51", Confirmed, NA)
  ) %>%
  select(Location, period_start, period_end, Week_50, Week_51) %>%
  arrange(Location, period_start)
# Display the wide-format table
ebola_wide
## Location period_start period_end Week_50 Week_51
## 1 BEYLA 15 December 2014 21 December 2014 NA 3
## 2 CONAKRY 08 December 2014 14 December 2014 37 NA
## 3 CONAKRY 15 December 2014 21 December 2014 NA 39
## 4 COYAH 08 December 2014 14 December 2014 31 NA
## 5 COYAH 15 December 2014 21 December 2014 NA 35
## 6 DABOLA 08 December 2014 14 December 2014 2 NA
## 7 DABOLA 15 December 2014 21 December 2014 NA 1
## 8 DUBREKA 08 December 2014 14 December 2014 6 NA
## 9 DUBREKA 15 December 2014 21 December 2014 NA 5
## 10 FARANAH 08 December 2014 14 December 2014 0 NA
## 11 FORECARIAH 08 December 2014 14 December 2014 22 NA
## 12 FORECARIAH 15 December 2014 21 December 2014 NA 15
## 13 FRIA 15 December 2014 21 December 2014 NA 1
## 14 GUECKEDOU 08 December 2014 14 December 2014 3 NA
## 15 GUECKEDOU 15 December 2014 21 December 2014 NA 7
## 16 KANKAN 08 December 2014 14 December 2014 2 NA
## 17 KANKAN 15 December 2014 21 December 2014 NA 2
## 18 KEROUANE 08 December 2014 14 December 2014 9 NA
## 19 KEROUANE 15 December 2014 21 December 2014 NA 17
## 20 KINDIA 08 December 2014 14 December 2014 1 NA
## 21 KINDIA 15 December 2014 21 December 2014 NA 21
## 22 KISSIDOUGOU 08 December 2014 14 December 2014 2 NA
## 23 KISSIDOUGOU 15 December 2014 21 December 2014 NA 117
## 24 KOUROUSSA 08 December 2014 14 December 2014 3 NA
## 25 LOLA 08 December 2014 14 December 2014 4 NA
## 26 LOLA 15 December 2014 21 December 2014 NA 14
## 27 MACENTA 08 December 2014 14 December 2014 6 NA
## 28 MACENTA 15 December 2014 21 December 2014 NA 24
## 29 N'ZEREKORE 08 December 2014 14 December 2014 13 NA
## 30 N'ZEREKORE 15 December 2014 21 December 2014 NA 18
## 31 SIGUIRI 08 December 2014 14 December 2014 0 NA
## 32 TELIMELE 08 December 2014 14 December 2014 10 NA
## 33 TELIMELE 15 December 2014 21 December 2014 NA 6
# Stack the two weekly columns back into long format. Convert the
# data.frame to a data.table first so data.table's melt() method is
# dispatched directly: passing a plain data.frame triggers the
# (now deprecated) redirection to reshape2::melt(), which emits the
# warning shown below. NOTE(review): the result is then a data.table
# rather than a data.frame; here it is only printed, so no caller is
# affected.
melt(as.data.table(ebola_wide),
     measure.vars = c("Week_50", "Week_51"),
     variable.name = "period", value.name = "cases")
## Warning in melt(ebola_wide, measure.vars = c("Week_50", "Week_51"),
## variable.name = "period", : The melt generic in data.table has been passed
## a data.frame and will attempt to redirect to the relevant reshape2 method;
## please note that reshape2 is deprecated, and this redirection is now
## deprecated as well. To continue using melt methods from reshape2 while both
## libraries are attached, e.g. melt.list, you can prepend the namespace like
## reshape2::melt(ebola_wide). In the next version, this warning will become an
## error.
## Location period_start period_end period cases
## 1 BEYLA 15 December 2014 21 December 2014 Week_50 NA
## 2 CONAKRY 08 December 2014 14 December 2014 Week_50 37
## 3 CONAKRY 15 December 2014 21 December 2014 Week_50 NA
## 4 COYAH 08 December 2014 14 December 2014 Week_50 31
## 5 COYAH 15 December 2014 21 December 2014 Week_50 NA
## 6 DABOLA 08 December 2014 14 December 2014 Week_50 2
## 7 DABOLA 15 December 2014 21 December 2014 Week_50 NA
## 8 DUBREKA 08 December 2014 14 December 2014 Week_50 6
## 9 DUBREKA 15 December 2014 21 December 2014 Week_50 NA
## 10 FARANAH 08 December 2014 14 December 2014 Week_50 0
## 11 FORECARIAH 08 December 2014 14 December 2014 Week_50 22
## 12 FORECARIAH 15 December 2014 21 December 2014 Week_50 NA
## 13 FRIA 15 December 2014 21 December 2014 Week_50 NA
## 14 GUECKEDOU 08 December 2014 14 December 2014 Week_50 3
## 15 GUECKEDOU 15 December 2014 21 December 2014 Week_50 NA
## 16 KANKAN 08 December 2014 14 December 2014 Week_50 2
## 17 KANKAN 15 December 2014 21 December 2014 Week_50 NA
## 18 KEROUANE 08 December 2014 14 December 2014 Week_50 9
## 19 KEROUANE 15 December 2014 21 December 2014 Week_50 NA
## 20 KINDIA 08 December 2014 14 December 2014 Week_50 1
## 21 KINDIA 15 December 2014 21 December 2014 Week_50 NA
## 22 KISSIDOUGOU 08 December 2014 14 December 2014 Week_50 2
## 23 KISSIDOUGOU 15 December 2014 21 December 2014 Week_50 NA
## 24 KOUROUSSA 08 December 2014 14 December 2014 Week_50 3
## 25 LOLA 08 December 2014 14 December 2014 Week_50 4
## 26 LOLA 15 December 2014 21 December 2014 Week_50 NA
## 27 MACENTA 08 December 2014 14 December 2014 Week_50 6
## 28 MACENTA 15 December 2014 21 December 2014 Week_50 NA
## 29 N'ZEREKORE 08 December 2014 14 December 2014 Week_50 13
## 30 N'ZEREKORE 15 December 2014 21 December 2014 Week_50 NA
## 31 SIGUIRI 08 December 2014 14 December 2014 Week_50 0
## 32 TELIMELE 08 December 2014 14 December 2014 Week_50 10
## 33 TELIMELE 15 December 2014 21 December 2014 Week_50 NA
## 34 BEYLA 15 December 2014 21 December 2014 Week_51 3
## 35 CONAKRY 08 December 2014 14 December 2014 Week_51 NA
## 36 CONAKRY 15 December 2014 21 December 2014 Week_51 39
## 37 COYAH 08 December 2014 14 December 2014 Week_51 NA
## 38 COYAH 15 December 2014 21 December 2014 Week_51 35
## 39 DABOLA 08 December 2014 14 December 2014 Week_51 NA
## 40 DABOLA 15 December 2014 21 December 2014 Week_51 1
## 41 DUBREKA 08 December 2014 14 December 2014 Week_51 NA
## 42 DUBREKA 15 December 2014 21 December 2014 Week_51 5
## 43 FARANAH 08 December 2014 14 December 2014 Week_51 NA
## 44 FORECARIAH 08 December 2014 14 December 2014 Week_51 NA
## 45 FORECARIAH 15 December 2014 21 December 2014 Week_51 15
## 46 FRIA 15 December 2014 21 December 2014 Week_51 1
## 47 GUECKEDOU 08 December 2014 14 December 2014 Week_51 NA
## 48 GUECKEDOU 15 December 2014 21 December 2014 Week_51 7
## 49 KANKAN 08 December 2014 14 December 2014 Week_51 NA
## 50 KANKAN 15 December 2014 21 December 2014 Week_51 2
## 51 KEROUANE 08 December 2014 14 December 2014 Week_51 NA
## 52 KEROUANE 15 December 2014 21 December 2014 Week_51 17
## 53 KINDIA 08 December 2014 14 December 2014 Week_51 NA
## 54 KINDIA 15 December 2014 21 December 2014 Week_51 21
## 55 KISSIDOUGOU 08 December 2014 14 December 2014 Week_51 NA
## 56 KISSIDOUGOU 15 December 2014 21 December 2014 Week_51 117
## 57 KOUROUSSA 08 December 2014 14 December 2014 Week_51 NA
## 58 LOLA 08 December 2014 14 December 2014 Week_51 NA
## 59 LOLA 15 December 2014 21 December 2014 Week_51 14
## 60 MACENTA 08 December 2014 14 December 2014 Week_51 NA
## 61 MACENTA 15 December 2014 21 December 2014 Week_51 24
## 62 N'ZEREKORE 08 December 2014 14 December 2014 Week_51 NA
## 63 N'ZEREKORE 15 December 2014 21 December 2014 Week_51 18
## 64 SIGUIRI 08 December 2014 14 December 2014 Week_51 NA
## 65 TELIMELE 08 December 2014 14 December 2014 Week_51 NA
## 66 TELIMELE 15 December 2014 21 December 2014 Week_51 6
# Same stack, but keep only Location as the identifier column, dropping
# period_start/period_end from the result. As above, converting to a
# data.table first avoids the deprecated data.table -> reshape2
# redirection warning. NOTE(review): the return class becomes
# data.table; the value is only printed here.
melt(as.data.table(ebola_wide),
     measure.vars = c("Week_50", "Week_51"),
     variable.name = "period", value.name = "cases",
     id.vars = "Location")
## Warning in melt(ebola_wide, measure.vars = c("Week_50", "Week_51"),
## variable.name = "period", : The melt generic in data.table has been passed
## a data.frame and will attempt to redirect to the relevant reshape2 method;
## please note that reshape2 is deprecated, and this redirection is now
## deprecated as well. To continue using melt methods from reshape2 while both
## libraries are attached, e.g. melt.list, you can prepend the namespace like
## reshape2::melt(ebola_wide). In the next version, this warning will become an
## error.
## Location period cases
## 1 BEYLA Week_50 NA
## 2 CONAKRY Week_50 37
## 3 CONAKRY Week_50 NA
## 4 COYAH Week_50 31
## 5 COYAH Week_50 NA
## 6 DABOLA Week_50 2
## 7 DABOLA Week_50 NA
## 8 DUBREKA Week_50 6
## 9 DUBREKA Week_50 NA
## 10 FARANAH Week_50 0
## 11 FORECARIAH Week_50 22
## 12 FORECARIAH Week_50 NA
## 13 FRIA Week_50 NA
## 14 GUECKEDOU Week_50 3
## 15 GUECKEDOU Week_50 NA
## 16 KANKAN Week_50 2
## 17 KANKAN Week_50 NA
## 18 KEROUANE Week_50 9
## 19 KEROUANE Week_50 NA
## 20 KINDIA Week_50 1
## 21 KINDIA Week_50 NA
## 22 KISSIDOUGOU Week_50 2
## 23 KISSIDOUGOU Week_50 NA
## 24 KOUROUSSA Week_50 3
## 25 LOLA Week_50 4
## 26 LOLA Week_50 NA
## 27 MACENTA Week_50 6
## 28 MACENTA Week_50 NA
## 29 N'ZEREKORE Week_50 13
## 30 N'ZEREKORE Week_50 NA
## 31 SIGUIRI Week_50 0
## 32 TELIMELE Week_50 10
## 33 TELIMELE Week_50 NA
## 34 BEYLA Week_51 3
## 35 CONAKRY Week_51 NA
## 36 CONAKRY Week_51 39
## 37 COYAH Week_51 NA
## 38 COYAH Week_51 35
## 39 DABOLA Week_51 NA
## 40 DABOLA Week_51 1
## 41 DUBREKA Week_51 NA
## 42 DUBREKA Week_51 5
## 43 FARANAH Week_51 NA
## 44 FORECARIAH Week_51 NA
## 45 FORECARIAH Week_51 15
## 46 FRIA Week_51 1
## 47 GUECKEDOU Week_51 NA
## 48 GUECKEDOU Week_51 7
## 49 KANKAN Week_51 NA
## 50 KANKAN Week_51 2
## 51 KEROUANE Week_51 NA
## 52 KEROUANE Week_51 17
## 53 KINDIA Week_51 NA
## 54 KINDIA Week_51 21
## 55 KISSIDOUGOU Week_51 NA
## 56 KISSIDOUGOU Week_51 117
## 57 KOUROUSSA Week_51 NA
## 58 LOLA Week_51 NA
## 59 LOLA Week_51 14
## 60 MACENTA Week_51 NA
## 61 MACENTA Week_51 24
## 62 N'ZEREKORE Week_51 NA
## 63 N'ZEREKORE Week_51 18
## 64 SIGUIRI Week_51 NA
## 65 TELIMELE Week_51 NA
## 66 TELIMELE Week_51 6
# Read the Oceania GDP/population data and tag every row with its
# continent (added by reference with data.table's `:=`).
gdp_oceania <- fread("./RInputFiles/gdp_and_pop_oceania.csv")
gdp_oceania[, continent := "Oceania"]
# Spread population into one column per year (countries as rows)
dcast(gdp_oceania, formula = country ~ year, value.var = "population")
## country 1990 1995 2000 2005 2010
## 1: Australia 17096869 18124770 19107251 20274282 22162863
## 2: Fiji 728626 775498 811223 821820 859952
## 3: French Polynesia 198370 215200 237267 254884 268065
## 4: Kiribati 72411 77727 84406 92329 102648
## 5: Marshall Islands 47300 51020 52161 52058 52428
## 6: Micronesia, Fed. Sts. 96331 107556 107430 106198 103619
## 7: New Caledonia 168537 189198 209997 228683 246345
## 8: New Zealand 3397534 3674886 3858234 4134699 4369027
## 9: Palau 15089 17255 19174 19907 20470
## 10: Papua New Guinea 4157903 4715929 5374051 6086905 6847517
## 11: Samoa 162865 170158 174614 179928 186029
## 12: Solomon Islands 311849 359236 412336 469306 526177
## 13: Tonga 95152 95889 97898 100858 103947
## 14: Tuvalu 9004 9227 9419 9694 9827
## 15: Vanuatu 146633 168236 185058 209375 236299
# Spread gdp into one column per country (years as rows)
dcast(gdp_oceania, formula = year ~ country, value.var = "gdp")
## year Australia Fiji French Polynesia Kiribati Marshall Islands
## 1: 1990 300107784341 1335705790 2777827650 48534476 101112724
## 2: 1995 338759864212 1516326476 2928452782 51419787 131689273
## 3: 2000 414780916644 1683082744 3442160840 68561481 110971769
## 4: 2005 485146627070 1896890618 NA 72512766 123281092
## 5: 2010 558301132672 1907499910 NA 73245763 127781844
## Micronesia, Fed. Sts. New Caledonia New Zealand Palau Papua New Guinea
## 1: 182461073 2285808773 39503489723 NA 2356993537
## 2: 235499455 2573368170 45171101764 109666344 3559227888
## 3: 233938394 2641677234 51606891179 119875504 3518223421
## 4: 233237060 NA 62729975739 127993766 3812278808
## 5: 221126797 NA 63915448868 117841911 5095994376
## Samoa Solomon Islands Tonga Tuvalu Vanuatu
## 1: 194099795 343423086 142132664 10157211 194634278
## 2: 204930258 504296634 171182064 11798530 228542854
## 3: 242925503 438937728 188551996 13741847 271991177
## 4: 313462373 457857482 213317400 13825499 283875173
## 5: 329190409 601834699 215089852 15329960 359737826
# Reshape from long to wide format, spreading both gdp and population
# across years (dcast casts long data into wide form; one column per
# value.var/year pair).
wide <- dcast(gdp_oceania, formula = country ~ year,
              value.var = c("gdp", "population"))
# Convert to a numeric matrix, promoting the country column to row names
as.matrix(wide, rownames = "country")
## gdp_1990 gdp_1995 gdp_2000 gdp_2005
## Australia 300107784341 338759864212 414780916644 485146627070
## Fiji 1335705790 1516326476 1683082744 1896890618
## French Polynesia 2777827650 2928452782 3442160840 NA
## Kiribati 48534476 51419787 68561481 72512766
## Marshall Islands 101112724 131689273 110971769 123281092
## Micronesia, Fed. Sts. 182461073 235499455 233938394 233237060
## New Caledonia 2285808773 2573368170 2641677234 NA
## New Zealand 39503489723 45171101764 51606891179 62729975739
## Palau NA 109666344 119875504 127993766
## Papua New Guinea 2356993537 3559227888 3518223421 3812278808
## Samoa 194099795 204930258 242925503 313462373
## Solomon Islands 343423086 504296634 438937728 457857482
## Tonga 142132664 171182064 188551996 213317400
## Tuvalu 10157211 11798530 13741847 13825499
## Vanuatu 194634278 228542854 271991177 283875173
## gdp_2010 population_1990 population_1995
## Australia 558301132672 17096869 18124770
## Fiji 1907499910 728626 775498
## French Polynesia NA 198370 215200
## Kiribati 73245763 72411 77727
## Marshall Islands 127781844 47300 51020
## Micronesia, Fed. Sts. 221126797 96331 107556
## New Caledonia NA 168537 189198
## New Zealand 63915448868 3397534 3674886
## Palau 117841911 15089 17255
## Papua New Guinea 5095994376 4157903 4715929
## Samoa 329190409 162865 170158
## Solomon Islands 601834699 311849 359236
## Tonga 215089852 95152 95889
## Tuvalu 15329960 9004 9227
## Vanuatu 359737826 146633 168236
## population_2000 population_2005 population_2010
## Australia 19107251 20274282 22162863
## Fiji 811223 821820 859952
## French Polynesia 237267 254884 268065
## Kiribati 84406 92329 102648
## Marshall Islands 52161 52058 52428
## Micronesia, Fed. Sts. 107430 106198 103619
## New Caledonia 209997 228683 246345
## New Zealand 3858234 4134699 4369027
## Palau 19174 19907 20470
## Papua New Guinea 5374051 6086905 6847517
## Samoa 174614 179928 186029
## Solomon Islands 412336 469306 526177
## Tonga 97898 100858 103947
## Tuvalu 9419 9694 9827
## Vanuatu 185058 209375 236299
# As above, but carry the constant continent column through as a second
# identifier on the left-hand side of the casting formula
dcast(gdp_oceania, formula = continent + country ~ year,
      value.var = c("gdp", "population"))
## continent country gdp_1990 gdp_1995 gdp_2000
## 1: Oceania Australia 300107784341 338759864212 414780916644
## 2: Oceania Fiji 1335705790 1516326476 1683082744
## 3: Oceania French Polynesia 2777827650 2928452782 3442160840
## 4: Oceania Kiribati 48534476 51419787 68561481
## 5: Oceania Marshall Islands 101112724 131689273 110971769
## 6: Oceania Micronesia, Fed. Sts. 182461073 235499455 233938394
## 7: Oceania New Caledonia 2285808773 2573368170 2641677234
## 8: Oceania New Zealand 39503489723 45171101764 51606891179
## 9: Oceania Palau NA 109666344 119875504
## 10: Oceania Papua New Guinea 2356993537 3559227888 3518223421
## 11: Oceania Samoa 194099795 204930258 242925503
## 12: Oceania Solomon Islands 343423086 504296634 438937728
## 13: Oceania Tonga 142132664 171182064 188551996
## 14: Oceania Tuvalu 10157211 11798530 13741847
## 15: Oceania Vanuatu 194634278 228542854 271991177
## gdp_2005 gdp_2010 population_1990 population_1995 population_2000
## 1: 485146627070 558301132672 17096869 18124770 19107251
## 2: 1896890618 1907499910 728626 775498 811223
## 3: NA NA 198370 215200 237267
## 4: 72512766 73245763 72411 77727 84406
## 5: 123281092 127781844 47300 51020 52161
## 6: 233237060 221126797 96331 107556 107430
## 7: NA NA 168537 189198 209997
## 8: 62729975739 63915448868 3397534 3674886 3858234
## 9: 127993766 117841911 15089 17255 19174
## 10: 3812278808 5095994376 4157903 4715929 5374051
## 11: 313462373 329190409 162865 170158 174614
## 12: 457857482 601834699 311849 359236 412336
## 13: 213317400 215089852 95152 95889 97898
## 14: 13825499 15329960 9004 9227 9419
## 15: 283875173 359737826 146633 168236 185058
## population_2005 population_2010
## 1: 20274282 22162863
## 2: 821820 859952
## 3: 254884 268065
## 4: 92329 102648
## 5: 52058 52428
## 6: 106198 103619
## 7: 228683 246345
## 8: 4134699 4369027
## 9: 19907 20470
## 10: 6086905 6847517
## 11: 179928 186029
## 12: 469306 526177
## 13: 100858 103947
## 14: 9694 9827
## 15: 209375 236299
# Read the industry-level GDP data (tab-separated file)
gdp_by_industry_oceania <- fread("./RInputFiles/gdp_by_industry_oceania.tsv")
# Inspect the long-format data before casting
gdp_by_industry_oceania
## country year population industry gdp
## 1: Australia 1995 18124770 agriculture 203255918527
## 2: Australia 1995 18124770 tourism 16937993211
## 3: Australia 2010 22162863 agriculture 251235509702
## 4: Australia 2010 22162863 tourism 83745169901
## 5: Fiji 1995 775498 agriculture 909795885
## 6: Fiji 1995 775498 tourism 75816324
## 7: Fiji 2010 859952 agriculture 858374960
## 8: Fiji 2010 859952 tourism 286124987
## 9: French Polynesia 1995 215200 agriculture 1757071669
## 10: French Polynesia 1995 215200 tourism 146422639
## 11: French Polynesia 2010 268065 agriculture <NA>
## 12: French Polynesia 2010 268065 tourism <NA>
## 13: Kiribati 1995 77727 agriculture 30851872
## 14: Kiribati 1995 77727 tourism 2570989
## 15: Kiribati 2010 102648 agriculture 32960593
## 16: Kiribati 2010 102648 tourism 10986864
## 17: Marshall Islands 1995 51020 agriculture 79013564
## 18: Marshall Islands 1995 51020 tourism 6584464
## 19: Marshall Islands 2010 52428 agriculture 57501830
## 20: Marshall Islands 2010 52428 tourism 19167277
## 21: Micronesia, Fed. Sts. 1995 107556 agriculture 141299673
## 22: Micronesia, Fed. Sts. 1995 107556 tourism 11774973
## 23: Micronesia, Fed. Sts. 2010 103619 agriculture 99507059
## 24: Micronesia, Fed. Sts. 2010 103619 tourism 33169020
## 25: New Caledonia 1995 189198 agriculture 1544020902
## 26: New Caledonia 1995 189198 tourism 128668409
## 27: New Caledonia 2010 246345 agriculture <NA>
## 28: New Caledonia 2010 246345 tourism <NA>
## 29: New Zealand 1995 3674886 agriculture 27102661058
## 30: New Zealand 1995 3674886 tourism 2258555088
## 31: New Zealand 2010 4369027 agriculture 28761951991
## 32: New Zealand 2010 4369027 tourism 9587317330
## 33: Palau 1995 17255 agriculture 65799806
## 34: Palau 1995 17255 tourism 5483317
## 35: Palau 2010 20470 agriculture 53028860
## 36: Palau 2010 20470 tourism 17676287
## 37: Papua New Guinea 1995 4715929 agriculture 2135536733
## 38: Papua New Guinea 1995 4715929 tourism 177961394
## 39: Papua New Guinea 2010 6847517 agriculture 2293197469
## 40: Papua New Guinea 2010 6847517 tourism 764399156
## 41: Samoa 1995 170158 agriculture 122958155
## 42: Samoa 1995 170158 tourism 10246513
## 43: Samoa 2010 186029 agriculture 148135684
## 44: Samoa 2010 186029 tourism 49378561
## 45: Solomon Islands 1995 359236 agriculture 302577980
## 46: Solomon Islands 1995 359236 tourism 25214832
## 47: Solomon Islands 2010 526177 agriculture 270825615
## 48: Solomon Islands 2010 526177 tourism 90275205
## 49: Tonga 1995 95889 agriculture 102709238
## 50: Tonga 1995 95889 tourism 8559103
## 51: Tonga 2010 103947 agriculture 96790433
## 52: Tonga 2010 103947 tourism 32263478
## 53: Tuvalu 1995 9227 agriculture 7079118
## 54: Tuvalu 1995 9227 tourism 589927
## 55: Tuvalu 2010 9827 agriculture 6898482
## 56: Tuvalu 2010 9827 tourism 2299494
## 57: Vanuatu 1995 168236 agriculture 137125712
## 58: Vanuatu 1995 168236 tourism 11427143
## 59: Vanuatu 2010 236299 agriculture 161882022
## 60: Vanuatu 2010 236299 tourism 53960674
## country year population industry gdp
dcast(gdp_by_industry_oceania, formula = country ~ industry + year, value.var=c("gdp"))
## country agriculture_1995 agriculture_2010 tourism_1995
## 1: Australia 203255918527 251235509702 16937993211
## 2: Fiji 909795885 858374960 75816324
## 3: French Polynesia 1757071669 <NA> 146422639
## 4: Kiribati 30851872 32960593 2570989
## 5: Marshall Islands 79013564 57501830 6584464
## 6: Micronesia, Fed. Sts. 141299673 99507059 11774973
## 7: New Caledonia 1544020902 <NA> 128668409
## 8: New Zealand 27102661058 28761951991 2258555088
## 9: Palau 65799806 53028860 5483317
## 10: Papua New Guinea 2135536733 2293197469 177961394
## 11: Samoa 122958155 148135684 10246513
## 12: Solomon Islands 302577980 270825615 25214832
## 13: Tonga 102709238 96790433 8559103
## 14: Tuvalu 7079118 6898482 589927
## 15: Vanuatu 137125712 161882022 11427143
## tourism_2010
## 1: 83745169901
## 2: 286124987
## 3: <NA>
## 4: 10986864
## 5: 19167277
## 6: 33169020
## 7: <NA>
## 8: 9587317330
## 9: 17676287
## 10: 764399156
## 11: 49378561
## 12: 90275205
## 13: 32263478
## 14: 2299494
## 15: 53960674
Chapter 1 - Introduction and Motivation
Introduction and Motivation:
Time features:
ts_history <- ts[1:(i-1)]; estimates <- mle.vonmises(ts_history); p_mean <- estimates$mu %% 24; concentration <- estimates$kappa; dens_i <- dvonmises(ts[i], mu = p_mean, kappa = concentration); alpha <- 0.90; quantile <- qvonmises((1-alpha)/2, mu = p_mean, kappa = concentration) %% 24; cutoff <- dvonmises(quantile, mu = p_mean, kappa = concentration); time_feature[i] <- dens_i >= cutoff
Frequency features:
n <- length(steps); frequency <- sum(auth_method[1:n] == auth_method[n + 1]); return(frequency)
Recency features:
n_t <- length(t); if (freq_auth[n_t] == 0) { recency <- 0 } else { time_diff <- t[1] - max(t[2:n_t][auth_cd[(n_t-1):1] == auth_cd[n_t]]); recency <- exp(-gamma * time_diff) }; return(recency)
group_by(account_name) %>% mutate(rec_auth = zoo::rollapply(timestamp, width = list(0:-length(transfer_id)), partial = TRUE, FUN = recency_fun, gamma, authentication_cd, freq_auth))
Example code includes:
load("./RInputFiles/transfers02_v2.RData") # data.frame transfers is 628x12
# Print the first 6 rows of the dataset
head(transfers)
## fraud_flag transfer_id timestamp orig_account_id benef_account_id
## 226 0 xtr215694 21966103 X27769025 X86111129
## 141 0 xtr671675 40885557 X15452684 X63932196
## 493 0 xtr977348 19945191 X96278924 X56011266
## 240 0 xtr655123 27404301 X27769025 X95653232
## 445 0 xtr785302 6566236 X96278924 X85352318
## 88 0 xtr187306 17576922 X15452684 X18544316
## benef_country channel_cd authentication_cd communication_cd
## 226 ISO03 CH01 AU02 COM02
## 141 ISO03 CH03 AU02 COM02
## 493 ISO03 CH04 AU05 COM02
## 240 ISO03 CH01 AU04 COM02
## 445 ISO04 CH07 AU04 COM01
## 88 ISO03 CH03 AU02 COM01
## empty_communication_flag orig_balance_before amount
## 226 0 5412 33
## 141 0 7268 40
## 493 0 1971 227
## 240 0 10603 20
## 445 0 6228 5176
## 88 0 4933 54
# Display the structure of the dataset
str(transfers)
## 'data.frame': 628 obs. of 12 variables:
## $ fraud_flag : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ transfer_id : Factor w/ 785 levels "xtr402538","xtr517350",..: 226 141 493 240 445 88 714 475 97 132 ...
## $ timestamp : num 21966103 40885557 19945191 27404301 6566236 ...
## $ orig_account_id : Factor w/ 9 levels "X51735094","X15452684",..: 3 2 6 3 6 2 8 6 2 2 ...
## $ benef_account_id : Factor w/ 301 levels "X71039384","X82542502",..: 78 31 199 87 180 11 276 179 28 11 ...
## $ benef_country : Factor w/ 4 levels "ISO03","ISO02",..: 1 1 1 1 4 1 1 1 1 1 ...
## $ channel_cd : Factor w/ 7 levels "CH07","CH05",..: 7 3 6 7 1 3 7 6 3 3 ...
## $ authentication_cd : Factor w/ 5 levels "AU02","AU04",..: 1 1 3 2 2 1 2 3 1 1 ...
## $ communication_cd : Factor w/ 3 levels "COM02","COM03",..: 1 1 1 1 3 3 3 3 1 1 ...
## $ empty_communication_flag: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ orig_balance_before : num 5412 7268 1971 10603 6228 ...
## $ amount : num 33 40 227 20 5176 ...
# Determine fraction of legitimate and fraudulent cases
class_distribution <- prop.table(table(transfers$fraud_flag))
print(class_distribution)
##
## 0 1
## 0.97770701 0.02229299
# Make pie chart of column fraud_flag
df <- data.frame(class = c("no fraud", "fraud"), pct = as.numeric(class_distribution)) %>%
mutate(class = factor(class, levels = c("no fraud", "fraud")),
cumulative = cumsum(pct), midpoint = cumulative - pct / 2,
label = paste0(class, " ", round(pct*100, 2), "%")
)
ggplot(df, aes(x = 1, weight = pct, fill = class)) +
scale_fill_manual(values = c("dodgerblue", "red")) +
geom_bar(width = 1, position = "stack") +
coord_polar(theta = "y") +
geom_text(aes(x = 1.3, y = midpoint, label = label)) +
ggmap::theme_nothing()
# Create vector predictions containing 0 for every transfer
predictions <- factor(rep(0, nrow(transfers)), levels = c(0, 1))
# Compute confusion matrix
caret::confusionMatrix(data = predictions, reference = transfers$fraud_flag)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 614 14
## 1 0 0
##
## Accuracy : 0.9777
## 95% CI : (0.9629, 0.9878)
## No Information Rate : 0.9777
## P-Value [Acc > NIR] : 0.570441
##
## Kappa : 0
##
## Mcnemar's Test P-Value : 0.000512
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.9777
## Neg Pred Value : NaN
## Prevalence : 0.9777
## Detection Rate : 0.9777
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : 0
##
# Compute cost of not detecting fraud
cost <- sum(transfers$amount[transfers$fraud_flag == 1])
print(cost)
## [1] 64410
# load("./RInputFiles/timestamps_circular.RData") # 'circular' num[1:25] ts
load("./RInputFiles/timestamps_digital.RData") # chr[1:25] timestamps
# Convert the plain text to hours
ts <- as.numeric(lubridate::hms(timestamps)) / 3600
# Convert the data to class circular
ts <- circular::circular(ts, units = "hours", template = "clock24")
# Plot a circular histogram
clock <- ggplot(data.frame(ts), aes(x = ts)) +
geom_histogram(breaks = seq(0, 24), colour = "blue", fill = "lightblue") +
coord_polar() +
scale_x_continuous("", limits = c(0, 24), breaks = seq(0, 24))
plot(clock)
# Create the von Mises distribution estimates
estimates <- circular::mle.vonmises(ts)
# Extract the periodic mean from the estimates
p_mean <- estimates$mu %% 24
# Add the periodic mean to the circular histogram
clock <- ggplot(data.frame(ts), aes(x = ts)) +
geom_histogram(breaks = seq(0, 24), colour = "blue", fill = "lightblue") +
coord_polar() +
scale_x_continuous("", limits = c(0, 24), breaks = seq(0, 24)) +
geom_vline(xintercept = as.numeric(p_mean), color = "red", linetype = 2, size = 1.5)
plot(clock)
# Estimate the periodic mean and concentration on the first 24 timestamps
alpha <- 0.95
p_mean <- estimates$mu %% 24
concentration <- estimates$kappa
# Estimate densities of all 25 timestamps
densities <- circular::dvonmises(ts, mu = p_mean, kappa = concentration)
# Check if the densities are larger than the cutoff of 95%-CI
cutoff <- circular::dvonmises(circular::qvonmises((1 - alpha)/2, mu = p_mean, kappa = concentration),
mu = p_mean, kappa = concentration
)
# Define the variable time_feature
time_feature <- densities >= cutoff
print(cbind.data.frame(ts, time_feature))
## ts time_feature
## 1 8.730000 TRUE
## 2 9.297778 TRUE
## 3 12.939444 TRUE
## 4 12.458889 TRUE
## 5 10.989722 TRUE
## 6 7.379167 TRUE
## 7 11.233056 TRUE
## 8 10.223889 TRUE
## 9 10.116944 TRUE
## 10 6.165556 TRUE
## 11 12.721389 TRUE
## 12 7.126389 TRUE
## 13 9.612222 TRUE
## 14 10.750000 TRUE
## 15 8.460000 TRUE
## 16 7.926389 TRUE
## 17 11.548889 TRUE
## 18 13.309722 TRUE
## 19 11.164167 TRUE
## 20 9.775833 TRUE
## 21 6.986667 TRUE
## 22 10.326667 TRUE
## 23 9.663056 TRUE
## 24 9.662778 TRUE
## 25 18.398333 FALSE
load("./RInputFiles/transfers_Bob.RData") # data.frame trans_Bob 17x12
# Frequency feature based on channel_cd
# Frequency feature based on channel_cd:
# counts how many of the n previous transfers (represented by `steps`) used
# the same channel as the current transfer, channel[n + 1].
frequency_fun <- function(steps, channel) {
  n <- length(steps)
  # seq_len(n) instead of 1:n: when there is no history (n == 0), 1:0 would
  # evaluate to c(1, 0) and count a phantom match, whereas seq_len(0) yields
  # an empty index so the frequency is correctly 0 for the first transfer
  frequency <- sum(channel[seq_len(n)] == channel[n + 1])
  return(frequency)
}
# Create freq_channel feature
freq_channel <- zoo::rollapply(trans_Bob$transfer_id, width = list(-1:-length(trans_Bob$transfer_id)),
partial = TRUE, FUN = frequency_fun, trans_Bob$channel_cd
)
# Print the features channel_cd, freq_channel and fraud_flag next to each other
freq_channel <- c(0, freq_channel)
cbind.data.frame(trans_Bob$channel_cd, freq_channel, trans_Bob$fraud_flag)
## trans_Bob$channel_cd freq_channel trans_Bob$fraud_flag
## 1 CH07 0 0
## 2 CH07 1 0
## 3 CH06 0 0
## 4 CH06 1 0
## 5 CH07 2 0
## 6 CH02 0 0
## 7 CH06 2 0
## 8 CH07 3 0
## 9 CH07 4 0
## 10 CH07 5 0
## 11 CH02 1 0
## 12 CH07 6 0
## 13 CH06 3 0
## 14 CH02 2 0
## 15 CH07 7 0
## 16 CH06 4 0
## 17 CH05 0 1
# load("./RInputFiles/transfers_AliceBob.RData") # data.frame trans 40x12
load("./RInputFiles/transfers_AliceBob_freq.RData") # data.frame trans 40x14
# Group the data
trans <- trans %>% group_by(account_name) %>%
# Mutate the data to add a new feature
mutate(freq_channel = c(0, zoo::rollapply(transfer_id, width = list(-1:-length(transfer_id)),
partial = TRUE, FUN = frequency_fun, channel_cd
)
)
)
# Print the features as columns next to each other
as.data.frame(trans %>% select(account_name, channel_cd, freq_channel, fraud_flag))
## account_name channel_cd freq_channel fraud_flag
## 1 Bob CH07 0 0
## 2 Alice CH04 0 0
## 3 Bob CH07 1 0
## 4 Bob CH06 0 0
## 5 Alice CH07 0 0
## 6 Bob CH06 1 0
## 7 Alice CH04 1 0
## 8 Bob CH07 2 0
## 9 Alice CH01 0 0
## 10 Bob CH02 0 0
## 11 Bob CH06 2 0
## 12 Alice CH03 0 0
## 13 Alice CH04 2 0
## 14 Alice CH04 3 0
## 15 Alice CH04 4 0
## 16 Bob CH07 3 0
## 17 Bob CH07 4 0
## 18 Bob CH07 5 0
## 19 Alice CH01 1 0
## 20 Alice CH04 5 0
## 21 Bob CH02 1 0
## 22 Alice CH04 6 0
## 23 Alice CH04 7 0
## 24 Bob CH07 6 0
## 25 Alice CH04 8 0
## 26 Bob CH06 3 0
## 27 Alice CH04 9 0
## 28 Alice CH04 10 0
## 29 Alice CH04 11 0
## 30 Bob CH02 2 0
## 31 Alice CH04 12 0
## 32 Alice CH01 2 0
## 33 Alice CH01 3 0
## 34 Bob CH07 7 0
## 35 Alice CH04 13 0
## 36 Alice CH04 14 0
## 37 Bob CH06 4 0
## 38 Alice CH04 15 0
## 39 Bob CH05 0 1
## 40 Alice CH05 0 1
# Create the recency function
# Recency feature: exp(-gamma * time_since_last_use_of_current_channel).
# Called via zoo::rollapply with width = list(0:-length(...)), so `t` is the
# timestamp window in reverse chronological order: t[1] is the current
# transfer, t[2:n_t] are the earlier ones. `gamma` sets the decay rate.
# NOTE(review): channel_cd and freq_channel appear to be passed as full,
# un-windowed columns (rollapply extra args are not windowed); the reversed
# index channel_cd[(n_t-1):1] relies on that alignment -- confirm against
# the rollapply call site before modifying.
recency_fun <- function(t, gamma, channel_cd, freq_channel) {
n_t <- length(t)
# If the channel has never been used before, recency is 0; otherwise decay
# exponentially with the time elapsed since its most recent prior use
if (freq_channel[n_t] == 0) {
return(0)
} else {
# max(...) picks the latest earlier timestamp on the same channel as the
# current transfer; t[1] - max(...) is the elapsed time since that use
time_diff <- t[1] - max(t[2:n_t][channel_cd[(n_t-1):1] == channel_cd[n_t]])
exponent <- -gamma * time_diff
return(exp(exponent))
}
}
# Group, mutate and rollapply
gamma <- -log(0.01)/90
trans <- trans %>%
group_by(account_name) %>%
mutate(rec_channel = zoo::rollapply(timestamp, width = list(0:-length(transfer_id)),
partial = TRUE, FUN = recency_fun,
gamma, channel_cd, freq_channel
)
)
# Print a new dataframe
as.data.frame(trans %>%
select(account_name, channel_cd, timestamp, rec_channel, fraud_flag)
)
## account_name channel_cd timestamp rec_channel fraud_flag
## 1 Bob CH07 3823030 0.0000000000 0
## 2 Alice CH04 4675604 0.0000000000 0
## 3 Bob CH07 4963503 0.0000000000 0
## 4 Bob CH06 5554880 0.0000000000 0
## 5 Alice CH07 5554975 0.0000000000 0
## 6 Bob CH06 5555011 0.0012271252 0
## 7 Alice CH04 6069408 0.0000000000 0
## 8 Bob CH07 6069430 0.0000000000 0
## 9 Alice CH01 6400316 0.0000000000 0
## 10 Bob CH02 6400428 0.0000000000 0
## 11 Bob CH06 6400499 0.0000000000 0
## 12 Alice CH03 7251861 0.0000000000 0
## 13 Alice CH04 8312166 0.0000000000 0
## 14 Alice CH04 8312333 0.0001944862 0
## 15 Alice CH04 8488444 0.0000000000 0
## 16 Bob CH07 9440864 0.0000000000 0
## 17 Bob CH07 10704152 0.0000000000 0
## 18 Bob CH07 13473757 0.0000000000 0
## 19 Alice CH01 14775404 0.0000000000 0
## 20 Alice CH04 19606163 0.0000000000 0
## 21 Bob CH02 24787430 0.0000000000 0
## 22 Alice CH04 24787532 0.0000000000 0
## 23 Alice CH04 24787661 0.0013593564 0
## 24 Bob CH07 24989386 0.0000000000 0
## 25 Alice CH04 24989464 0.0000000000 0
## 26 Bob CH06 27471273 0.0000000000 0
## 27 Alice CH04 29049288 0.0000000000 0
## 28 Alice CH04 30230242 0.0000000000 0
## 29 Alice CH04 30230331 0.0105250029 0
## 30 Bob CH02 32397147 0.0000000000 0
## 31 Alice CH04 32397147 0.0000000000 0
## 32 Alice CH01 32397148 0.0000000000 0
## 33 Alice CH01 32737015 0.0000000000 0
## 34 Bob CH07 35169990 0.0000000000 0
## 35 Alice CH04 35178888 0.0000000000 0
## 36 Alice CH04 35178952 0.0378248991 0
## 37 Bob CH06 35179134 0.0000000000 0
## 38 Alice CH04 36302300 0.0000000000 0
## 39 Bob CH05 38132166 0.0000000000 1
## 40 Alice CH05 38296341 0.0000000000 1
load("./RInputFiles/transfers_chap1_L4.RData") # data.frame transfers 222x16
# Statistics of frequency & recency features of legitimate transactions:
summary(transfers %>%
filter(fraud_flag==0) %>%
select(freq_channel, freq_auth, rec_channel, rec_auth)
)
## freq_channel freq_auth rec_channel rec_auth
## Min. : 0.00 Min. : 0.00 Min. :0.0000 Min. :0.0000
## 1st Qu.: 6.25 1st Qu.: 4.00 1st Qu.:0.7083 1st Qu.:0.5506
## Median : 28.50 Median :12.00 Median :0.8894 Median :0.8462
## Mean : 45.52 Mean :20.29 Mean :0.7766 Mean :0.7176
## 3rd Qu.: 82.75 3rd Qu.:26.75 3rd Qu.:0.9999 3rd Qu.:0.9999
## Max. :137.00 Max. :81.00 Max. :1.0000 Max. :1.0000
# Statistics of frequency & recency features of fraudulent transactions:
summary(transfers %>%
filter(fraud_flag==1) %>%
select(freq_channel, freq_auth, rec_channel, rec_auth)
)
## freq_channel freq_auth rec_channel rec_auth
## Min. : 0.0 Min. : 1.00 Min. :0.00000 Min. :0.01699
## 1st Qu.: 0.0 1st Qu.: 4.00 1st Qu.:0.00000 1st Qu.:0.02704
## Median : 2.0 Median : 6.00 Median :0.02124 Median :0.03644
## Mean : 3.5 Mean : 9.75 Mean :0.03705 Mean :0.24854
## 3rd Qu.: 5.5 3rd Qu.:11.75 3rd Qu.:0.05830 3rd Qu.:0.25794
## Max. :10.0 Max. :26.00 Max. :0.10573 Max. :0.90430
Chapter 2 - Social Network Analytics
Social network analytics:
Fraud and social network analytics:
Social network based inference:
Social network metrics:
Example code includes:
load("./RInputFiles/network_data.RData") # data.frame transfers 60x6 and data.frame account_info 38x2
library(igraph)
# Have a look at the data
head(transfers)
## originator beneficiary amount time benef_country payment_channel
## 1 I47 I87 1463.72327 15:12 CAN CHAN_01
## 2 I40 I61 143.26357 15:40 GBR CHAN_01
## 3 I89 I61 53.32169 11:44 GBR CHAN_05
## 4 I24 I52 226.27792 14:55 GBR CHAN_03
## 5 I40 I87 1151.49827 21:20 CAN CHAN_03
## 6 I63 I54 110.43880 20:21 GBR CHAN_03
# Create an undirected network from the dataset
net <- graph_from_data_frame(transfers, directed = FALSE)
# Plot the network with the vertex labels in bold and black
plot(net, vertex.label.color = "black", vertex.label.font = 2)
load("./RInputFiles/network_data_simple.RData") # data.frame edges 16x2
# Create a network from the data frame
net <- graph_from_data_frame(edges, directed = FALSE)
# Plot the network with the multiple edges
plot(net, layout = layout_in_circle)
# Specify new edge attributes width and curved
E(net)$width <- count.multiple(net)
E(net)$curved <- FALSE
# Check the new edge attributes and plot the network with overlapping edges
edge_attr(net)
## $width
## [1] 7 7 7 7 7 7 7 1 1 1 4 4 4 4 1 1
##
## $curved
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE
plot(net, layout = layout_in_circle)
# Create an undirected network from the dataset
net <- graph_from_data_frame(transfers, directed = FALSE)
# Add account_type as an attribute to the nodes of the network
V(net)$account_type <- account_info$type
# Have a look at the vertex attributes
print(vertex_attr(net))
## $name
## [1] "I47" "I40" "I89" "I24" "I63" "I28" "I44" "I23" "I41" "I93" "I52" "I25"
## [13] "I69" "I15" "I21" "I77" "I76" "I17" "I81" "I37" "I11" "I87" "I61" "I54"
## [25] "I80" "I20" "I64" "I46" "I19" "I55" "I14" "I30" "I29" "I35" "I27" "I60"
## [37] "I22" "I66"
##
## $account_type
## [1] 3 3 1 2 2 1 1 1 3 2 2 1 2 1 2 1 2 1 1 3 3 3 1 1 1 3 2 2 2 1 1 2 2 2 2 1 1 2
# Check for homophily based on account_type
assortativity_nominal(net, types = V(net)$account_type, directed = FALSE)
## [1] 0.1810621
# Each account type is assigned a color
vertex_colors <- c("grey", "lightblue", "darkorange")
# Add attribute color to V(net) which holds the color of each node depending on its account_type
V(net)$color <- vertex_colors[V(net)$account_type]
# Plot the network
plot(net)
load("./RInputFiles/network_data_v2.RData") # data.frame transfers 60x6 and data.frame account_info 38x3
# From data frame to graph
net <- graph_from_data_frame(transfers, directed = FALSE)
# Plot the network; color nodes according to isMoneyMule-variable
V(net)$color <- ifelse(account_info$isMoneyMule, "darkorange", "slateblue1")
plot(net, vertex.label.color = "black", vertex.label.font = 2, vertex.size = 18)
# Find the id of the money mule accounts
print(account_info$id[account_info$isMoneyMule])
## [1] I47 I41 I87 I20
## 38 Levels: I11 I14 I15 I17 I19 I20 I21 I22 I23 I24 I25 I27 I28 I29 I30 ... I93
# Create subgraph containing node "I41" and all money mules nodes
subnet <- induced_subgraph(net, v = c("I41", "I47", "I87", "I20"))
# Compute the money mule probability of node "I41" based on the neighbors
strength(subnet, v="I41") / strength(net, v="I41")
## I41
## 0.6
load("./RInputFiles/kite.RData") # list[1:10] kite
# Plot network kite
plot(kite)
# Find the degree of each node
degree(kite)
## [1] 4 4 3 6 3 5 5 3 2 1
# Which node has the largest degree?
which.max(degree(kite))
## [1] 4
# Plot kite with vertex.size proportional to the degree of each node
plot(kite, vertex.size = 6 * degree(kite))
# Find the closeness of each node
closeness(kite)
## [1] 0.05882353 0.05882353 0.05555556 0.06666667 0.05555556 0.07142857
## [7] 0.07142857 0.06666667 0.04761905 0.03448276
# Which node has the largest closeness?
which.max(closeness(kite))
## [1] 6
# Plot kite with vertex.size proportional to the closeness of each node
plot(kite, vertex.size = 500 * closeness(kite))
# Find the betweenness of each node
betweenness(kite)
## [1] 0.8333333 0.8333333 0.0000000 3.6666667 0.0000000 8.3333333
## [7] 8.3333333 14.0000000 8.0000000 0.0000000
# Which node has the largest betweenness?
which.max(betweenness(kite))
## [1] 8
# Plot kite with vertex.size proportional to the betweenness of each node
plot(kite, vertex.size = 5 * betweenness(kite))
# Plot network and print account info
plot(net)
legend("bottomleft", legend = c("known money mule", "legit account"),
fill = c("darkorange", "lightblue"), bty = "n"
)
print(account_info)
## id isMoneyMule type
## 1 I47 TRUE 3
## 2 I40 FALSE 3
## 3 I89 FALSE 1
## 4 I24 FALSE 2
## 5 I63 FALSE 2
## 6 I28 FALSE 1
## 7 I44 FALSE 1
## 8 I23 FALSE 1
## 9 I41 TRUE 3
## 10 I93 FALSE 2
## 11 I52 FALSE 2
## 12 I25 FALSE 1
## 13 I69 FALSE 2
## 14 I15 FALSE 1
## 15 I21 FALSE 2
## 16 I77 FALSE 1
## 17 I76 FALSE 2
## 18 I17 FALSE 1
## 19 I81 FALSE 1
## 20 I37 FALSE 3
## 21 I11 FALSE 3
## 22 I87 TRUE 3
## 23 I61 FALSE 1
## 24 I54 FALSE 1
## 25 I80 FALSE 1
## 26 I20 TRUE 3
## 27 I64 FALSE 2
## 28 I46 FALSE 2
## 29 I19 FALSE 2
## 30 I55 FALSE 1
## 31 I14 FALSE 1
## 32 I30 FALSE 2
## 33 I29 FALSE 2
## 34 I35 FALSE 2
## 35 I27 FALSE 2
## 36 I60 FALSE 1
## 37 I22 FALSE 1
## 38 I66 FALSE 2
# Degree
account_info$degree <- degree(net, normalized = TRUE)
# Closeness
account_info$closeness <- closeness(net, normalized = TRUE)
# Betweenness
account_info$betweenness <- betweenness(net, normalized = TRUE)
print(account_info)
## id isMoneyMule type degree closeness betweenness
## 1 I47 TRUE 3 0.08108108 0.3775510 0.0000000000
## 2 I40 FALSE 3 0.16216216 0.3425926 0.1979479479
## 3 I89 FALSE 1 0.05405405 0.2587413 0.0000000000
## 4 I24 FALSE 2 0.18918919 0.3737374 0.2600100100
## 5 I63 FALSE 2 0.13513514 0.2983871 0.0350350350
## 6 I28 FALSE 1 0.21621622 0.3627451 0.2942942943
## 7 I44 FALSE 1 0.18918919 0.3663366 0.2209709710
## 8 I23 FALSE 1 0.21621622 0.3592233 0.2757757758
## 9 I41 TRUE 3 0.13513514 0.4352941 0.2817817818
## 10 I93 FALSE 2 0.08108108 0.3274336 0.0838338338
## 11 I52 FALSE 2 0.05405405 0.2846154 0.0000000000
## 12 I25 FALSE 1 0.08108108 0.3008130 0.0347847848
## 13 I69 FALSE 2 0.08108108 0.2781955 0.0000000000
## 14 I15 FALSE 1 0.08108108 0.2720588 0.0007507508
## 15 I21 FALSE 2 0.05405405 0.2700730 0.0000000000
## 16 I77 FALSE 1 0.08108108 0.2700730 0.0000000000
## 17 I76 FALSE 2 0.08108108 0.2700730 0.0000000000
## 18 I17 FALSE 1 0.05405405 0.3032787 0.0315315315
## 19 I81 FALSE 1 0.08108108 0.2720588 0.0007507508
## 20 I37 FALSE 3 0.05405405 0.3217391 0.0155155155
## 21 I11 FALSE 3 0.08108108 0.3663366 0.0950950951
## 22 I87 TRUE 3 0.16216216 0.4352941 0.3753753754
## 23 I61 FALSE 1 0.05405405 0.2587413 0.0000000000
## 24 I54 FALSE 1 0.05405405 0.2761194 0.0000000000
## 25 I80 FALSE 1 0.02702703 0.2569444 0.0000000000
## 26 I20 TRUE 3 0.13513514 0.4157303 0.2309809810
## 27 I64 FALSE 2 0.08108108 0.3057851 0.0385385385
## 28 I46 FALSE 2 0.05405405 0.2700730 0.0000000000
## 29 I19 FALSE 2 0.02702703 0.2661871 0.0000000000
## 30 I55 FALSE 1 0.05405405 0.2700730 0.0000000000
## 31 I14 FALSE 1 0.08108108 0.2781955 0.0000000000
## 32 I30 FALSE 2 0.05405405 0.2700730 0.0000000000
## 33 I29 FALSE 2 0.08108108 0.2700730 0.0000000000
## 34 I35 FALSE 2 0.02702703 0.2700730 0.0000000000
## 35 I27 FALSE 2 0.02702703 0.2740741 0.0000000000
## 36 I60 FALSE 1 0.02702703 0.2661871 0.0000000000
## 37 I22 FALSE 1 0.02702703 0.2740741 0.0000000000
## 38 I66 FALSE 2 0.02702703 0.2740741 0.0000000000
Chapter 3 - Imbalanced Class Distributions
Dealing with imbalanced datasets:
Random under-sampling:
Synthetic minority over-sampling:
From dataset to detection model:
cost <- sum(true.classes * (1 - predicted.classes) * amounts + predicted.classes * fixedcost); return(cost)
Example code includes:
load("./RInputFiles/transfers02_v2.RData") # data.frame transfers is 628x12
# Make a scatter plot
ggplot(transfers, aes(x = amount, y = orig_balance_before)) +
geom_point(aes(color = fraud_flag, shape = fraud_flag)) +
scale_color_manual(values = c('dodgerblue', 'red'))
load("./RInputFiles/creditcard5.RData") # data.frame creditcard is 9840x32
# Calculate the required number of cases in the over-sampled dataset
n_new <- sum(creditcard$Class==0) / (1-0.3333)
# Over-sample
oversampling_result <- ROSE::ovun.sample(formula = Class ~ ., data = creditcard,
method = "over", N = n_new, seed = 2018
)
# Verify the Class-balance of the over-sampled dataset
oversampled_credit <- oversampling_result$data
prop.table(table(oversampled_credit$Class))
##
## 0 1
## 0.6667142 0.3332858
# Calculate the required number of cases in the over-sampled dataset
n_new <- sum(creditcard$Class == 1) / (0.4)
# Under-sample
undersampling_result <- ROSE::ovun.sample(formula = Class ~ ., data = creditcard,
method = "under", N = n_new, seed = 2018
)
# Verify the Class-balance of the under-sampled dataset
undersampled_credit <- undersampling_result$data
prop.table(table(undersampled_credit$Class))
##
## 0 1
## 0.6 0.4
# Specify the desired number of cases in the balanced dataset and the fraction of fraud cases
n_new <- 10000
fraud_fraction <- 0.3
# Combine ROS & RUS!
sampling_result <- ROSE::ovun.sample(formula = Class ~ ., data = creditcard, method = "both",
N = n_new, p = fraud_fraction, seed = 2018
)
# Verify the Class-balance of the re-balanced dataset
sampled_credit <- sampling_result$data
prop.table(table(sampled_credit$Class))
##
## 0 1
## 0.6984 0.3016
# Set the number of fraud and legitimate cases, and the desired percentage of legitimate cases
n0 <- sum(creditcard$Class==0)
n1 <- sum(creditcard$Class==1)
r0 <- 0.6
# Calculate the value for the dup_size parameter of SMOTE
ntimes <- ((1 - r0) / r0) * (n0 / n1) - 1
# Create synthetic fraud cases with SMOTE
smote_output <- smotefamily::SMOTE(X = creditcard[ , -c(1, 31, 32)], target = creditcard$Class,
K = 5, dup_size = ntimes
)
# Make a scatter plot of the original and over-sampled dataset
credit_smote <- smote_output$data
colnames(credit_smote)[30] <- "Class"
prop.table(table(credit_smote$Class))
##
## 0 1
## 0.6129032 0.3870968
ggplot(creditcard, aes(x = V1, y = V2, color = Class)) +
geom_point() +
scale_color_manual(values = c('dodgerblue2', 'red'))
ggplot(credit_smote, aes(x = V1, y = V2, color = Class)) +
geom_point() +
scale_color_manual(values = c('dodgerblue2', 'red'))
set.seed(1903172344)
testIdx <- sort(sample(1:nrow(creditcard), round(0.5*nrow(creditcard)), replace=FALSE))
test <- creditcard[testIdx, ]
train_original <- creditcard[-testIdx, ]
n_new <- 7380
fraud_fraction <- 0.3
train_oversampled <- ROSE::ovun.sample(formula = Class ~ ., data = train_original,
method = "over", N = n_new, seed = 2018
)$data
# Train the rpart algorithm on the original training set and the SMOTE-rebalanced training set
model_orig <- rpart::rpart(Class ~ ., data = train_original)
model_smote <- rpart::rpart(Class ~ ., data = train_oversampled)
# Predict the fraud probabilities of the test cases
scores_orig <- predict(model_orig, newdata = test, type = "prob")[, 2]
scores_smote <- predict(model_smote, newdata = test, type = "prob")[, 2]
# Convert the probabilities to classes (0 or 1) using a cutoff value
predicted_class_orig <- factor(ifelse(scores_orig > 0.5, 1, 0))
predicted_class_smote <- factor(ifelse(scores_smote > 0.5, 1, 0))
# Determine the confusion matrices and the model's accuracy
CM_orig <- caret::confusionMatrix(data = predicted_class_orig, reference = test$Class)
CM_smote <- caret::confusionMatrix(data = predicted_class_smote, reference = test$Class)
print(CM_orig$table)
## Reference
## Prediction 0 1
## 0 4666 47
## 1 3 204
print(CM_orig$overall[1])
## Accuracy
## 0.9898374
print(CM_smote$table)
## Reference
## Prediction 0 1
## 0 4577 38
## 1 92 213
print(CM_smote$overall[1])
## Accuracy
## 0.9735772
# Total deployment cost of a fraud-detection model:
# every undetected fraudulent transfer costs its full amount, and every
# transfer flagged as fraud incurs a fixed investigation fee.
cost_model <- function(predicted.classes, true.classes, amounts, fixedcost) {
  pred <- hmeasure::relabel(predicted.classes)
  actual <- hmeasure::relabel(true.classes)
  # Fraud that slipped through (actual == 1, predicted == 0) costs the amount
  missed_fraud <- actual * (1 - pred) * amounts
  # Each positive prediction triggers the fixed handling cost
  investigation <- pred * fixedcost
  sum(missed_fraud + investigation)
}
# Calculate the total cost of deploying the original model
cost_model(predicted_class_orig, test$Class, test$Amount, fixedcost=10)
## [1] 9923.82
# Calculate the total cost of deploying the model using SMOTE
cost_model(predicted_class_smote, test$Class, test$Amount, fixedcost=10)
## [1] 11391.11
Chapter 4 - Digit Analysis and Robust Statistics
Digit analysis using Benford’s Law:
Benford’s Law for fraud detection:
Detecting univariate outliers:
stat_boxplot(geom = "errorbar", width = 0.2, coef = 1.5*exp(3*mc(los))) + geom_boxplot(ymin = adjbox_stats[1], ymax = adjbox_stats[5], middle = adjbox_stats[3], upper = adjbox_stats[4], lower = adjbox_stats[2], outlier.shape = NA, fill = "lightblue", width = 0.5) + geom_point(data = subset(data.frame(los), los < adjbox_stats[1] | los > adjbox_stats[5]), col = "red", size = 3, shape = 16) + xlab("") + ylab("Length Of Stay (LOS)") + theme(text = element_text(size = 25))
Detecting multivariate outliers:
Example code includes:
# Benford's Law: expected relative frequency of a leading digit d is
# P(d) = log10(1 + 1/d); vectorized over d (valid for d in 1:9).
benlaw <- function(d) {
  log10(1 + 1 / d)
}
# Calculate expected frequency for d=5
benlaw(d=5)
## [1] 0.07918125
# Create a dataframe of the 9 digits and their Benford's Law probabilities
df <- data.frame(digit = 1:9, probability = benlaw(1:9))
# Create barplot with expected frequencies
ggplot(df, aes(x = digit, y = probability)) +
geom_bar(stat = "identity", fill = "dodgerblue") +
xlab("First digit") +
ylab("Expected frequency") +
scale_x_continuous(breaks = 1:9, labels = 1:9) +
ylim(0, 0.33) +
theme(text = element_text(size = 25))
data(census.2009, package="benford.analysis")
# Check conformity
bfd.cen <- benford.analysis::benford(census.2009$pop.2009, number.of.digits = 1)
plot(bfd.cen, except = c("second order", "summation", "mantissa", "chi squared",
"abs diff", "ex summation", "Legend"),
multiple = F
)
# Multiply the data by 3 and check conformity again
data <- census.2009$pop.2009 * 3
bfd.cen3 <- benford.analysis::benford(data, number.of.digits=1)
plot(bfd.cen3, except = c("second order", "summation", "mantissa", "chi squared",
"abs diff", "ex summation", "Legend"),
multiple = F
)
load("./RInputFiles/fireinsuranceclaims.RData") # num[1:40000] fireinsuranceclaims
# Validate data against Benford's Law using first digit
bfd.ins <- benford.analysis::benford(fireinsuranceclaims, number.of.digits = 1)
plot(bfd.ins, except=c("second order", "summation", "mantissa", "chi squared",
"abs diff", "ex summation", "Legend"),
multiple = F
)
# Validate data against Benford's Law using first-two digits
bfd.ins2 <- benford.analysis::benford(fireinsuranceclaims, number.of.digits = 2)
plot(bfd.ins2, except=c("second order", "summation", "mantissa", "chi squared",
"abs diff", "ex summation", "Legend"),
multiple = F
)
load("./RInputFiles/expensesCEO.RData") # num[1:988] expensesCEO
# Validate data against Benford's Law using first digit
bfd.exp <- benford.analysis::benford(expensesCEO, number.of.digits = 1)
plot(bfd.exp, except=c("second order", "summation", "mantissa", "chi squared",
"abs diff", "ex summation", "Legend"),
multiple = F
)
# Validate data against Benford's Law using first-two digits
bfd.exp2 <- benford.analysis::benford(expensesCEO, number.of.digits = 2)
plot(bfd.exp2, except=c("second order", "summation", "mantissa", "chi squared",
"abs diff", "ex summation", "Legend"),
multiple = F
)
load("./RInputFiles/transfers_chap1_L4.RData") # data.frame transfers 222x16
# Get observations identified as fraud
which(transfers$fraud_flag == 1)
## [1] 71 198 220 222
# Compute median and mean absolute deviation for `amount`
m <- median(transfers$amount)
s <- mad(transfers$amount)
# Compute robust z-score for each observation
robzscore <- abs((transfers$amount - m) / (s))
# Get observations with robust z-score higher than 3 in absolute value
which(abs(robzscore) > 3)
## [1] 1 7 14 18 31 36 43 44 48 49 50 53 63 71 79 91 112 113 116
## [20] 122 123 124 132 152 153 162 168 170 177 191 198 205 214 219 220 222
thexp <- c(40517, 33541, 5182, 40385, 40302, 23189, 13503, 5110, 15754, 40763, 23061, 30839, 25206, 15891, 38821, 11766, 4934, 13754, 14142, 27813, 21005, 11511, 41750, 32855, 62043, 19415, 51815, 26961, 19185, 19704, 21831, 40768, 49079, 12766, 13030, 8841, 17943, 6214, 21114, 7898, 30707, 69698, 70155, 15032, 55858, 31747, 11562, 12390, 7016, 96396, 24614, 22735, 20483, 36907, 31822, 13619, 34401, 10281, 32165, 52226, 13941, 40850, 15270, 21143, 26029, 10209, 10950, 19745, 54153, 33668, 7562, 34231, 34219, 25784, 52952, 32959, 17459, 25611, 14998, 36229, 26485, 20563, 41865, 29821, 26792, 42406, 20083, 10205, 31353, 33674, 13523, 51835, 18136, 54736, 33499, 95389, 44967, 67707, 40879, 17729, 15643, 15648, 19150, 9789, 27978, 40469, 30696, 48195, 12817, 10527, 42946, 72281, 13773, 17189, 14340, 47962, 29063, 34477, 84354, 37943, 13584, 12184, 49563, 36263, 18313, 25399, 50235, 14230, 25617, 18226, 31542, 24262, 17617, 22068, 43534, 14574, 6471, 57500, 8535, 85065, 22749, 10481, 42094, 24436, 27975, 28347, 32929, 20106, 30992, 22202, 17005, 29900, 16871, 10790, 38355, 10315, 7782, 16084, 11788, 20005, 70859, 21706, 11929, 69816, 6351, 27217, 30178, 10597, 13715, 13687, 17116, 27426, 56579, 31655, 86577, 27051, 10477, 11178, 49785, 12626, 44817, 15758, 21396, 5590, 40538, 38834, 32693, 47330, 17823, 92957, 44439, 27188, 22972, 40020, 33067, 24562, 8408, 36088, 8823, 8022, 36395, 14523, 49188, 19744, 17536, 32456, 38400, 6451, 31766, 24727, 60013, 15664, 17356, 50482, 20752, 28048, 10932, 35337, 18755, 6572, 16065, 67257, 36303, 14846, 50468, 27237, 7165, 38067, 22040, 23794, 106032, 19303, 63934, 16818, 11621, 40566, 14921, 15188, 28087, 21026, 38907, 39727, 49794, 49112, 6886, 31674, 25053, 23835, 12160, 45640, 7898, 107065, 58206, 8270, 69529, 25776, 57742, 18456, 53523, 27514, 14089, 7291, 4727, 7319, 22650, 8462, 14980, 39085, 13627, 14998, 29686, 22750, 25487, 40127, 22844, 7597, 19265, 14869, 33010, 74958, 30320, 16602, 36376, 16467, 26946, 26870, 33433, 
61134, 20121, 58389, 61594, 24150, 32594, 20177, 3133, 22330, 16905, 25053, 10837, 20807, 6647, 29696, 34457, 78286, 21551, 26533, 52237, 58745, 19607, 25062, 23823, 108756, 12067, 28813, 17803, 35795, 20860, 14307, 11991, 71409, 74111, 25916, 25914, 14092, 24780, 43417, 12767, 43919, 28205, 34075, 68173, 28509, 78760, 52329, 31858, 54088, 6670, 29371, 30066, 16554, 13866, 13806, 40504, 49841, 19729, 30881, 35484, 11373, 10624, 7544, 49465, 12499, 18316, 27963, 31601, 52243, 48927, 42339, 34707, 13034, 9452, 69461, 64335, 9964, 8993, 11217, 116262, 16693, 28622, 35251, 8587, 28414, 15552, 49460, 57721, 16398, 20597, 28592, 13795, 19534, 37065, 22847, 10679, 37552, 63611, 12023, 19659, 16040, 63519, 56897, 55381, 44055, 65130, 8251, 4857, 6225, 74197, 20986, 47412, 35014, 39263, 19486, 22648, 7826, 52697, 9196, 31890, 8836, 62723, 60814, 42018, 20671, 7444, 21584, 13213, 23959, 16394, 21742, 71587, 16866, 63788, 39375, 37087, 9370, 37440, 10666, 60304, 34391, 55988, 9921, 11709, 69198, 49491, 39201, 10065, 32063, 10380, 18818, 35576, 15695, 30336, 74993, 48089, 24255, 56063, 11932, 23251, 9537, 7757, 67473, 44949, 11842, 12681, 36156, 71957, 37576, 36120, 53232, 6743, 36142, 62565, 49525, 33161, 28272, 24179, 15227, 46151, 32764, 32374, 22462, 9593, 9442, 13202, 24634, 35284, 44800, 11044, 45655, 5502, 18124, 18007, 39303, 14012, 26558, 10926, 25389, 16447, 8720, 12977, 12942, 25020, 6430, 18919, 24916, 55667, 51081, 99834, 31840, 147498)
thexp <- c(thexp, 27913, 40044, 24278, 41614, 50038, 54307, 27598, 18022, 34789, 24503, 51856, 12160, 19107, 40845, 46171, 66460, 22241, 15245, 10925, 17671, 5666, 25305, 9394, 20357, 32948, 7942, 14555, 24012, 25235, 17292, 81646, 46737, 15012, 49861, 10012, 15076, 7693, 18513, 46293, 50770, 36122, 25598, 8633, 15568, 16954, 47036, 38076, 18458, 8092, 38576, 28692, 27211, 37485, 15162, 32968, 55021, 7060, 16714, 34581, 14939, 27056, 15090, 56905, 29528, 21282, 39487, 24239, 35466, 21982, 10334, 15133, 41591, 23260, 12882, 32149, 16219, 41605, 8346, 8549, 23818, 36217, 42766, 11239, 59532, 31806, 52218, 73118, 31701, 32761, 18745, 17949, 8017, 12833, 25583, 36468, 8706, 94587, 42900, 74298, 17201, 25618, 14888, 16308, 75043, 68056, 50797, 15956, 13820, 13985, 22742, 17692, 30214, 57582, 17273, 31885, 14307, 22597, 46389, 23366, 35128, 51769, 9251, 35663, 34474, 18748, 60091, 31137, 14366, 25347, 32175, 16065, 16672, 45192, 42039, 19665, 24933, 29570, 23400, 13517, 23993, 18140, 9545, 16042, 24425, 28400, 25035, 28316, 19001, 27203, 8016, 15199, 14069, 12037, 30455, 13877, 10696, 11010, 41384, 37241, 38328, 54434, 27174, 14015, 27354, 12944, 19718, 21558, 22239, 31076, 32940, 17810, 13462, 16122, 20417, 36205, 9871, 11892, 50737, 32511, 29767, 17032, 40276, 24005, 33884, 16278, 11326, 7187, 50434, 70436, 38353, 27723, 32811, 14833, 28465, 83158, 18866, 21823, 39125, 45372, 33933, 29469, 79147, 24190, 37007, 4259, 41346, 25087, 27216, 32780, 21190, 29067, 11316, 36103, 15389, 27257, 26051, 22853, 10551, 6661, 15878, 17131, 18220, 12045, 10573, 34645, 19517, 13933, 14452, 33694, 35605, 48376, 19567, 35762, 12931, 6286, 25321, 22167, 24243, 19433, 26852, 25802, 26647, 26423, 11537, 16011, 56547, 22690, 20391, 10487, 16994, 25690, 19260, 57525, 17802, 28135, 14365, 21640, 20817, 55771, 31693, 31859, 27496, 39715, 22775, 27933, 58875, 39133, 7604, 29409, 29296, 44377, 33107, 21235, 59129, 33427, 10164, 14595, 4744, 49674, 27827, 48830, 36196, 24979, 47800, 108752, 52300, 
38343, 19381, 35881, 43688, 32938, 13341, 29297, 38603, 30202, 14797, 38529, 29055, 61303, 27109, 53496, 16665, 65132, 23903, 36096, 21247, 42292, 11176, 7542, 15210, 5289, 58444, 33295, 30456, 60595, 59624, 19642, 13317, 9262, 17611, 35079, 45469, 59510, 26852, 51484, 20195, 27751, 33555, 27692, 70407, 18102, 130773, 16637, 60463, 11653, 19275, 47114, 6117, 29645, 57846, 51033, 11790, 24970, 32391, 19278, 27778, 19596, 17761, 56884, 66230, 40617, 80495, 27704, 22815, 23390, 18092, 13037, 27954, 6979, 6942, 46155, 34240, 24484, 22375, 45916, 32788, 28017, 31922, 25357, 7314)
# Draw a standard (Tukey) boxplot of total household expenditure.
# boxplot() invisibly returns a list of plot statistics, which we keep
# so the flagged outliers can be inspected afterwards.
bp.thexp <- boxplot(thexp, col = "lightblue", main = "Standard boxplot",
ylab = "Total household expenditure"
)
# Extract the outliers from the data: boxplot() stores every point that
# falls beyond the whiskers in the $out component of its return value.
bp.thexp$out
## [1] 96396 95389 84354 85065 86577 92957 106032 107065 74958 78286
## [11] 108756 74111 78760 116262 74197 74993 99834 147498 81646 94587
## [21] 74298 75043 83158 79147 108752 130773 80495
# Create adjusted boxplot: robustbase::adjbox() adjusts the whisker rule
# for skewed distributions (via the medcouple), so fewer points in the
# long right tail of expenditure are flagged as outliers than with the
# standard boxplot above.
adj.thexp <- robustbase::adjbox(thexp, col = "lightblue", main = "Adjusted boxplot",
ylab = "Total household expenditure"
)
# Load the hail-insurance data: a 100 x 2 numeric matrix named
# `hailinsurance` (columns: house price, claim amount).
load("./RInputFiles/hailinsurance.RData") # matrix num[1:100, 1:2] hailinsurance
# Create a scatterplot of claim against house price
plot(hailinsurance, xlab = "price house", ylab = "claim")
# Compute the classical (non-robust) location and scatter estimates:
# the sample mean vector and sample covariance matrix.
clcenter <- colMeans(hailinsurance)
clcov <- cov(hailinsurance)
# Add 97.5% tolerance ellipsoid. Under a bivariate normal model the
# squared Mahalanobis distance is chi-squared with df = 2, so the ellipse
# radius is the square root of the 0.975 chi-squared quantile.
rad <- sqrt(qchisq(0.975, df=ncol(hailinsurance)))
car::ellipse(center = clcenter, shape = clcov, radius = rad,col = "blue", lty = 2)
# Create a scatterplot of the data (fresh plot for the robust ellipse)
plot(hailinsurance, xlab = "price house", ylab = "claim")
# Compute robust estimates for location and scatter with the Minimum
# Covariance Determinant (MCD) estimator, which down-weights outlying
# observations that distort the classical mean/covariance.
mcdresult <- robustbase::covMcd(hailinsurance)
robustcenter <- mcdresult$center
robustcov <- mcdresult$cov
# Add robust 97.5% tolerance ellipsoid using the same chi-squared radius
# as the classical version, but centered/shaped by the MCD estimates.
rad <- sqrt(qchisq(0.975, df=ncol(hailinsurance)))
car::ellipse(center = robustcenter, shape = robustcov, radius = rad, col = "red")
Chapter 1 - Principal Component Analysis
The curse of dimensionality:
Getting PCA to work with FactoMineR:
Interpreting and visualizing PCA models:
Example code includes:
# Read the 2004 cars dataset; fread() returns a data.table, converted to
# a plain data.frame so later row-name assignment works.
cars <- as.data.frame(data.table::fread("./RInputFiles/04carsdata.csv"))
# Row indices to drop (mostly pickups/outliers excluded from the PCA) --
# the vehicle names are printed below for reference before deletion.
rowsDelete <- c(59, 65, 71, 83, 84, 85, 108, 109, 116, 119, 124, 127, 128, 138, 143, 146, 147, 148, 205, 239, 240, 244, 245, 247, 248, 256, 291, 293, 295, 296, 297, 304, 315, 321, 325, 355, 399, 400, 401, 414, 415)
cars$`Vehicle Name`[rowsDelete]
## [1] "Cadillac Escalade EXT"
## [2] "Chevrolet Avalanche 1500"
## [3] "Chevrolet Colorado Z85"
## [4] "Chevrolet Silverado 1500 Regular Cab"
## [5] "Chevrolet Silverado SS"
## [6] "Chevrolet SSR"
## [7] "Dodge Dakota Club Cab"
## [8] "Dodge Dakota Regular Cab"
## [9] "Dodge Ram 1500 Regular Cab ST"
## [10] "Dodge Viper SRT-10 convertible 2dr"
## [11] "Ford Excursion 6.8 XLT"
## [12] "Ford F-150 Regular Cab XL"
## [13] "Ford F-150 Supercab Lariat"
## [14] "Ford Ranger 2.3 XL Regular Cab"
## [15] "GMC Canyon Z85 SL Regular Cab"
## [16] "GMC Sierra Extended Cab 1500"
## [17] "GMC Sierra HD 2500"
## [18] "GMC Sonoma Crew Cab"
## [19] "Kia Amanti 4dr"
## [20] "Mazda B2300 SX Regular Cab"
## [21] "Mazda B4000 SE Cab Plus"
## [22] "Mazda RX-8 4dr automatic"
## [23] "Mazda RX-8 4dr manual"
## [24] "Mazda3 i 4dr"
## [25] "Mazda3 s 4dr"
## [26] "Mercedes-Benz C320 4dr"
## [27] "Mitsubishi Galant ES 2.4L 4dr"
## [28] "Mitsubishi Lancer ES 4dr"
## [29] "Mitsubishi Lancer LS 4dr"
## [30] "Mitsubishi Lancer OZ Rally 4dr auto"
## [31] "Mitsubishi Lancer Sportback LS"
## [32] "Nissan Frontier King Cab XE V6"
## [33] "Nissan Titan King Cab XE"
## [34] "Pontiac Bonneville GXP 4dr"
## [35] "Pontiac GTO 2dr"
## [36] "Subaru Baja"
## [37] "Toyota Tacoma"
## [38] "Toyota Tundra Access Cab V6 SR5"
## [39] "Toyota Tundra Regular Cab V6"
## [40] "Volkswagen Phaeton 4dr"
## [41] "Volkswagen Phaeton W12 4dr"
# Rows whose vehicle names are duplicated (RWD/AWD variants of the same
# model); row names must be unique before row.names(cars) is assigned.
rowsMod <- c(182, 183, 252, 253, 255, 256)
cars$`Vehicle Name`[rowsMod]
## [1] "Infiniti G35 4dr" "Infiniti G35 4dr" "Mercedes-Benz C240 4dr"
## [4] "Mercedes-Benz C240 4dr" "Mercedes-Benz C320 4dr" "Mercedes-Benz C320 4dr"
# Disambiguate the duplicates in one vectorized step. The pairs alternate
# RWD then AWD, so the 2-element suffix vector recycles cleanly across
# the six indices (182 RWD, 183 AWD, 252 RWD, 253 AWD, 255 RWD, 256 AWD)
# -- identical to spelling out six separate paste0() calls.
rowNames <- cars$`Vehicle Name`
rowNames[rowsMod] <- paste0(rowNames[rowsMod], c(" RWD", " AWD"))
# Use the (now unique) vehicle names as row names, drop the excluded
# rows, and remove the name column so only model variables remain.
row.names(cars) <- rowNames
cars <- cars[-rowsDelete, ]
cars$`Vehicle Name` <- NULL
cars$type <- factor(c(3, 3, 5, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 6, 3, 3, 3, 4, 6, 3, 4, 4, 4, 3, 3, 3, 3, 6, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 5, 5, 4, 4, 3, 3, 3, 3, 3, 5, 3, 3, 5, 3, 3, 3, 5, 3, 5, 4, 1, 3, 3, 3, 3, 3, 4, 4, 3, 3, 3, 3, 3, 3, 6, 3, 3, 5, 5, 5, 5, 1, 3, 3, 3, 3, 3, 4, 6, 3, 3, 3, 3, 3, 3, 1, 1, 5, 1, 5, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 5, 5, 5, 3, 3, 3, 6, 3, 3, 1, 4, 4, 3, 6, 3, 4, 5, 1, 5, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 5, 5, 3, 1, 1, 5, 4, 5, 3, 3, 3, 3, 3, 3, 5, 3, 3, 4, 3, 3, 6, 6, 3, 3, 3, 3, 3, 3, 5, 5, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 3, 3, 5, 5, 5, 3, 3, 3, 3, 6, 1, 5, 3, 3, 3, 5, 5, 5, 3, 3, 3, 5, 3, 3, 6, 3, 5, 5, 4, 5, 3, 3, 3, 3, 5, 3, 3, 3, 1, 4, 4, 5, 3, 3, 6, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 6, 3, 6, 3, 5, 5, 3, 3, 4, 4, 4, 4, 4, 3, 3, 3, 3, 1, 5, 6, 3, 3, 3, 3, 3, 4, 4, 5, 3, 4, 5, 5, 4, 4, 3, 3, 3, 3, 6, 5, 5, 1, 1, 3, 3, 3, 5, 3, 3, 1, 5, 3, 3, 3, 1, 1, 3, 3, 6, 4, 4, 4, 4, 4, 4, 5, 3, 3, 3, 3, 6, 3, 3, 3, 6, 3, 3, 3, 3, 3, 5, 3, 6, 6, 3, 4, 4, 3, 3, 6, 3, 3, 3, 3, 3, 6, 3, 3, 3, 5, 5, 5, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 5, 5, 6, 4, 3, 5, 5, 1, 1, 3, 3, 6, 3, 3, 3, 3, 6, 3, 3, 6, 3, 5, 3, 3, 3, 3, 3, 3, 3, 3, 3, 6, 6, 5), levels=c(1, 2, 3, 4, 5, 6), labels=c('Minivan', 'Pickup', 'Small.Sporty..Compact.Large.Sedan', 'Sports.Car', 'SUV', 'Wagon'))
cars$wheeltype <- factor(c(1, 2, 1, 2, 2, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 1, 2, 2, 1, 2, 1, 2, 2, 2, 2, 1, 1, 2, 1, 1, 2, 1, 1, 2, 1, 1, 2, 2, 2, 1, 2, 2, 2, 2, 1, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 1, 2, 1, 2, 2, 1, 2, 2, 1, 2, 1, 1, 1, 2, 2, 1, 1, 2, 2, 1, 2, 2, 1, 2, 2, 2, 1, 2, 1, 1, 2, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 1, 2, 1, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 1, 1, 1, 2, 2, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 2, 2, 1, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 1, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 2, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 2, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 1, 1, 2, 2, 1, 1, 2, 1, 1, 1, 1, 2, 2, 1, 2, 2, 2, 2, 2, 1, 1, 1, 2, 1, 1, 2, 1, 1, 2, 2, 2, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 2, 2, 2, 1, 2, 1, 2, 2, 2, 2, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 1, 1, 2, 1, 2, 2, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1), levels=c(1, 2), labels=c("AWD", "RWD"))
# Syntactic replacement names for the cars columns (dots instead of
# spaces/slashes/parentheses), in the same order as names(cars).
colNames <- c('Small.Sporty..Compact.Large.Sedan', 'Sports.Car', 'SUV', 'Wagon', 'Minivan', 'Pickup', 'AWD', 'RWD', 'Retail.Price', 'Dealer.Cost', 'Engine.Size..l.', 'Cyl', 'HP', 'City.MPG', 'Hwy.MPG', 'Weight', 'Wheel.Base', 'Len', 'Width', 'type', 'wheeltype')
# Report each original -> replacement pairing before the rename is applied.
for (i in seq_along(colNames)) {
  oldName <- names(cars)[i]
  cat("Original Name: ", oldName, " ---> New Name: ", colNames[i], "\n")
}
## Original Name: Small/Sporty/ Compact/Large Sedan ---> New Name: Small.Sporty..Compact.Large.Sedan
## Original Name: Sports Car ---> New Name: Sports.Car
## Original Name: SUV ---> New Name: SUV
## Original Name: Wagon ---> New Name: Wagon
## Original Name: Minivan ---> New Name: Minivan
## Original Name: Pickup ---> New Name: Pickup
## Original Name: AWD ---> New Name: AWD
## Original Name: RWD ---> New Name: RWD
## Original Name: Retail Price ---> New Name: Retail.Price
## Original Name: Dealer Cost ---> New Name: Dealer.Cost
## Original Name: Engine Size (l) ---> New Name: Engine.Size..l.
## Original Name: Cyl ---> New Name: Cyl
## Original Name: HP ---> New Name: HP
## Original Name: City MPG ---> New Name: City.MPG
## Original Name: Hwy MPG ---> New Name: Hwy.MPG
## Original Name: Weight ---> New Name: Weight
## Original Name: Wheel Base ---> New Name: Wheel.Base
## Original Name: Len ---> New Name: Len
## Original Name: Width ---> New Name: Width
## Original Name: type ---> New Name: type
## Original Name: wheeltype ---> New Name: wheeltype
# Apply the syntactic column names prepared above.
names(cars) <- colNames
# Coerce the measurement columns that fread() read as another numeric
# type to integer, matching the str() output shown below.
cars <- cars %>% mutate(City.MPG=as.integer(City.MPG), Hwy.MPG=as.integer(Hwy.MPG),
Weight=as.integer(Weight), Wheel.Base=as.integer(Wheel.Base),
Len=as.integer(Len), Width=as.integer(Width)
)
str(cars)
## 'data.frame': 387 obs. of 21 variables:
## $ Small.Sporty..Compact.Large.Sedan: int 1 1 0 0 1 1 1 1 1 1 ...
## $ Sports.Car : int 0 0 0 1 0 0 0 0 0 0 ...
## $ SUV : int 0 0 1 0 0 0 0 0 0 0 ...
## $ Wagon : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Minivan : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Pickup : int 0 0 0 0 0 0 0 0 0 0 ...
## $ AWD : int 0 0 1 0 0 0 0 0 0 0 ...
## $ RWD : int 0 0 0 1 0 0 0 0 0 0 ...
## $ Retail.Price : int 43755 46100 36945 89765 23820 33195 26990 25940 31840 42490 ...
## $ Dealer.Cost : int 39014 41100 33337 79978 21761 30299 24647 23508 28846 38325 ...
## $ Engine.Size..l. : num 3.5 3.5 3.5 3.2 2 3.2 2.4 1.8 3 3 ...
## $ Cyl : int 6 6 6 6 4 6 4 4 6 6 ...
## $ HP : int 225 225 265 290 200 270 200 170 220 220 ...
## $ City.MPG : int 18 18 17 17 24 20 22 22 20 20 ...
## $ Hwy.MPG : int 24 24 23 24 31 28 29 31 28 27 ...
## $ Weight : int 3880 3893 4451 3153 2778 3575 3230 3252 3462 3814 ...
## $ Wheel.Base : int 115 115 106 100 101 108 105 104 104 105 ...
## $ Len : int 197 197 189 174 172 186 183 179 179 180 ...
## $ Width : int 72 72 77 71 68 72 69 70 70 70 ...
## $ type : Factor w/ 6 levels "Minivan","Pickup",..: 3 3 5 4 3 3 3 3 3 3 ...
## $ wheeltype : Factor w/ 2 levels "AWD","RWD": 1 2 1 2 2 1 2 2 2 1 ...
# Explore cars with summary()
summary(cars)
## Small.Sporty..Compact.Large.Sedan Sports.Car SUV
## Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :1.0000 Median :0.0000 Median :0.0000
## Mean :0.6047 Mean :0.1163 Mean :0.1525
## 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000
## Wagon Minivan Pickup AWD
## Min. :0.00000 Min. :0.00000 Min. :0 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0 1st Qu.:0.0000
## Median :0.00000 Median :0.00000 Median :0 Median :0.0000
## Mean :0.07494 Mean :0.05168 Mean :0 Mean :0.2016
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0 3rd Qu.:0.0000
## Max. :1.00000 Max. :1.00000 Max. :0 Max. :1.0000
## RWD Retail.Price Dealer.Cost Engine.Size..l.
## Min. :0.0000 Min. : 10280 Min. : 9875 Min. :1.400
## 1st Qu.:0.0000 1st Qu.: 20997 1st Qu.: 19575 1st Qu.:2.300
## Median :0.0000 Median : 28495 Median : 26155 Median :3.000
## Mean :0.2429 Mean : 33231 Mean : 30441 Mean :3.127
## 3rd Qu.:0.0000 3rd Qu.: 39553 3rd Qu.: 36124 3rd Qu.:3.800
## Max. :1.0000 Max. :192465 Max. :173560 Max. :6.000
## Cyl HP City.MPG Hwy.MPG
## Min. : 3.000 Min. : 73.0 Min. :10.00 Min. :12.00
## 1st Qu.: 4.000 1st Qu.:165.0 1st Qu.:18.00 1st Qu.:24.00
## Median : 6.000 Median :210.0 Median :19.00 Median :27.00
## Mean : 5.757 Mean :214.4 Mean :20.31 Mean :27.26
## 3rd Qu.: 6.000 3rd Qu.:250.0 3rd Qu.:21.50 3rd Qu.:30.00
## Max. :12.000 Max. :493.0 Max. :60.00 Max. :66.00
## Weight Wheel.Base Len Width
## Min. :1850 Min. : 89.0 Min. :143 Min. :64.00
## 1st Qu.:3107 1st Qu.:103.0 1st Qu.:177 1st Qu.:69.00
## Median :3469 Median :107.0 Median :186 Median :71.00
## Mean :3532 Mean :107.2 Mean :185 Mean :71.28
## 3rd Qu.:3922 3rd Qu.:112.0 3rd Qu.:193 3rd Qu.:73.00
## Max. :6400 Max. :130.0 Max. :221 Max. :81.00
## type wheeltype
## Minivan : 20 AWD:183
## Pickup : 0 RWD:204
## Small.Sporty..Compact.Large.Sedan:234
## Sports.Car : 45
## SUV : 59
## Wagon : 29
# Get the correlation matrix of the continuous variables (columns 9:19),
# using only complete observations.
correl <- cor(cars[,9:19], use = "complete.obs")
# Use ggcorrplot() to explore the correlation matrix visually
ggcorrplot::ggcorrplot(correl)
# Reorder the matrix by hierarchical clustering of the correlations and
# show only the lower triangle, grouping strongly correlated variables.
ggcorrplot_clustered <- ggcorrplot::ggcorrplot(correl, hc.order = TRUE, type = "lower")
ggcorrplot_clustered
# Run a PCA on the 11 non-binary numeric variables of cars (columns
# 9:19), keeping 4 components and suppressing the default graphs.
pca_output_ten_v <- FactoMineR::PCA(cars[,9:19], ncp = 4, graph = FALSE)
# Get the summary of the first 100 cars (individuals)
summary(pca_output_ten_v, nbelements = 100)
##
## Call:
## FactoMineR::PCA(X = cars[, 9:19], ncp = 4, graph = F)
##
##
## Eigenvalues
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5 Dim.6 Dim.7
## Variance 7.105 1.884 0.850 0.357 0.275 0.198 0.141
## % of var. 64.588 17.127 7.725 3.246 2.504 1.799 1.277
## Cumulative % of var. 64.588 81.714 89.439 92.685 95.189 96.988 98.266
## Dim.8 Dim.9 Dim.10 Dim.11
## Variance 0.087 0.066 0.037 0.001
## % of var. 0.788 0.604 0.336 0.007
## Cumulative % of var. 99.053 99.657 99.993 100.000
##
## Individuals (the 100 first)
## Dist Dim.1 ctr cos2 Dim.2 ctr cos2 Dim.3
## 1 | 1.887 | 1.567 0.089 0.690 | 0.447 0.027 0.056 | 0.287
## 2 | 1.961 | 1.636 0.097 0.696 | 0.340 0.016 0.030 | 0.346
## 3 | 2.535 | 1.907 0.132 0.565 | 0.411 0.023 0.026 | -0.553
## 4 | 4.457 | 1.590 0.092 0.127 | -3.863 2.046 0.751 | 0.357
## 5 | 2.810 | -2.655 0.256 0.892 | -0.654 0.059 0.054 | -0.173
## 6 | 0.865 | 0.441 0.007 0.260 | -0.081 0.001 0.009 | 0.190
## 7 | 1.766 | -1.538 0.086 0.759 | -0.028 0.000 0.000 | -0.098
## 8 | 2.248 | -2.036 0.151 0.820 | 0.062 0.001 0.001 | -0.095
## 9 | 0.803 | -0.410 0.006 0.261 | -0.471 0.030 0.344 | -0.278
## 10 | 1.006 | 0.151 0.001 0.023 | -0.802 0.088 0.636 | -0.103
## 11 | 0.991 | 0.023 0.000 0.001 | -0.569 0.044 0.329 | -0.757
## 12 | 1.027 | -0.023 0.000 0.000 | -0.526 0.038 0.262 | -0.774
## 13 | 1.362 | 0.522 0.010 0.147 | -0.847 0.098 0.387 | -0.504
## 14 | 2.109 | -1.534 0.086 0.529 | -0.232 0.007 0.012 | 0.161
## 15 | 1.347 | 0.906 0.030 0.452 | -0.253 0.009 0.035 | -0.004
## 16 | 0.680 | 0.347 0.004 0.261 | 0.114 0.002 0.028 | 0.275
## 17 | 1.246 | 0.914 0.030 0.539 | 0.018 0.000 0.000 | -0.106
## 18 | 1.097 | 0.809 0.024 0.544 | 0.038 0.000 0.001 | -0.112
## 19 | 2.810 | 2.500 0.227 0.792 | -0.794 0.086 0.080 | 0.194
## 20 | 4.739 | 4.383 0.699 0.856 | -0.393 0.021 0.007 | 1.544
## 21 | 5.893 | 4.914 0.878 0.696 | -2.492 0.852 0.179 | 0.976
## 22 | 3.316 | 2.358 0.202 0.505 | -1.782 0.436 0.289 | -0.864
## 23 | 3.396 | 2.391 0.208 0.496 | -1.763 0.426 0.269 | -1.081
## 24 | 3.290 | -1.964 0.140 0.356 | -1.361 0.254 0.171 | -1.109
## 25 | 3.258 | -1.783 0.116 0.299 | -1.610 0.355 0.244 | -0.950
## 26 | 2.713 | -0.564 0.012 0.043 | -1.844 0.466 0.462 | -0.526
## 27 | 1.332 | -0.963 0.034 0.522 | -0.326 0.015 0.060 | -0.181
## 28 | 1.266 | -0.429 0.007 0.115 | -0.588 0.047 0.216 | -0.366
## 29 | 1.389 | -1.033 0.039 0.553 | -0.242 0.008 0.030 | -0.270
## 30 | 1.280 | -0.703 0.018 0.302 | -0.273 0.010 0.046 | -0.578
## 31 | 1.280 | -0.515 0.010 0.162 | -0.367 0.019 0.082 | -0.638
## 32 | 1.192 | -0.471 0.008 0.156 | -0.794 0.086 0.443 | 0.115
## 33 | 1.270 | 0.069 0.000 0.003 | -1.037 0.147 0.666 | -0.036
## 34 | 1.172 | -0.511 0.010 0.190 | -0.722 0.071 0.380 | 0.076
## 35 | 1.082 | -0.316 0.004 0.085 | -0.760 0.079 0.494 | -0.016
## 36 | 1.526 | 0.394 0.006 0.067 | 0.505 0.035 0.109 | 0.545
## 37 | 1.556 | 0.739 0.020 0.225 | 0.118 0.002 0.006 | 1.050
## 38 | 3.252 | 2.888 0.303 0.789 | -0.787 0.085 0.059 | 0.911
## 39 | 4.384 | 3.998 0.581 0.832 | -0.692 0.066 0.025 | 1.575
## 40 | 4.985 | 4.452 0.721 0.797 | -0.381 0.020 0.006 | 2.008
## 41 | 2.759 | 1.612 0.095 0.342 | -1.893 0.492 0.471 | -0.388
## 42 | 2.368 | 1.159 0.049 0.239 | -1.577 0.341 0.444 | -0.442
## 43 | 1.558 | 1.041 0.039 0.446 | 0.039 0.000 0.001 | -0.834
## 44 | 3.747 | 3.470 0.438 0.857 | -0.723 0.072 0.037 | -0.221
## 45 | 2.544 | -1.513 0.083 0.354 | -1.472 0.297 0.335 | -0.892
## 46 | 2.488 | -1.046 0.040 0.177 | -1.958 0.526 0.619 | -0.440
## 47 | 1.478 | -0.232 0.002 0.025 | 1.218 0.203 0.679 | 0.342
## 48 | 1.797 | 0.713 0.019 0.158 | 1.372 0.258 0.583 | 0.648
## 49 | 1.733 | 0.880 0.028 0.258 | 1.101 0.166 0.404 | 0.795
## 50 | 2.378 | 1.352 0.066 0.323 | 1.424 0.278 0.359 | 1.129
## 51 | 2.509 | 1.884 0.129 0.564 | 1.060 0.154 0.178 | 0.971
## 52 | 2.969 | 2.683 0.262 0.817 | 0.712 0.070 0.058 | -0.609
## 53 | 1.399 | 0.803 0.023 0.329 | 0.731 0.073 0.273 | 0.188
## 54 | 1.464 | 0.265 0.003 0.033 | 1.051 0.152 0.515 | 0.482
## 55 | 1.501 | 0.672 0.016 0.200 | 1.153 0.182 0.590 | -0.227
## 56 | 1.354 | 0.969 0.034 0.513 | 0.436 0.026 0.104 | -0.140
## 57 | 3.348 | 2.993 0.326 0.799 | 0.565 0.044 0.028 | 1.071
## 58 | 3.586 | 3.273 0.390 0.833 | 0.229 0.007 0.004 | 1.224
## 59 | 5.389 | 5.174 0.974 0.922 | 0.728 0.073 0.018 | -0.202
## 60 | 3.174 | 2.921 0.310 0.847 | 0.180 0.004 0.003 | 0.892
## 61 | 3.627 | 3.504 0.447 0.933 | -0.049 0.000 0.000 | 0.080
## 62 | 4.168 | 2.908 0.308 0.487 | -2.810 1.083 0.454 | 0.635
## 63 | 3.640 | 2.434 0.216 0.447 | 1.676 0.385 0.212 | -1.598
## 64 | 4.568 | -4.533 0.747 0.985 | -0.290 0.012 0.004 | -0.113
## 65 | 4.977 | -4.791 0.835 0.927 | -0.771 0.082 0.024 | -0.451
## 66 | 3.415 | -3.208 0.374 0.882 | 0.614 0.052 0.032 | 0.828
## 67 | 3.450 | -3.262 0.387 0.894 | 0.526 0.038 0.023 | 0.798
## 68 | 3.367 | -3.159 0.363 0.881 | 0.528 0.038 0.025 | 0.875
## 69 | 3.799 | 2.462 0.220 0.420 | -1.243 0.212 0.107 | 0.089
## 70 | 3.925 | 2.646 0.255 0.455 | -1.570 0.338 0.160 | 0.267
## 71 | 1.893 | -0.056 0.000 0.001 | 1.519 0.316 0.644 | 0.856
## 72 | 1.718 | 0.428 0.007 0.062 | 1.291 0.229 0.565 | 0.661
## 73 | 1.675 | 0.979 0.035 0.341 | 1.007 0.139 0.361 | 0.346
## 74 | 2.583 | -2.264 0.186 0.768 | 0.882 0.107 0.117 | 0.564
## 75 | 1.320 | -0.689 0.017 0.272 | 0.523 0.038 0.157 | 0.180
## 76 | 1.409 | -0.764 0.021 0.293 | 0.381 0.020 0.073 | 0.574
## 77 | 1.355 | -0.336 0.004 0.061 | 0.826 0.094 0.371 | 0.428
## 78 | 1.826 | -0.157 0.001 0.007 | 1.431 0.281 0.614 | 0.822
## 79 | 1.638 | 0.575 0.012 0.123 | 1.251 0.215 0.584 | 0.179
## 80 | 6.246 | 5.594 1.138 0.802 | 2.600 0.928 0.173 | 0.622
## 81 | 4.990 | 4.655 0.788 0.870 | 1.178 0.190 0.056 | -0.524
## 82 | 3.105 | -1.828 0.122 0.347 | -1.000 0.137 0.104 | -2.017
## 83 | 2.763 | 2.318 0.195 0.704 | 1.002 0.138 0.131 | -0.702
## 84 | 1.087 | 0.353 0.005 0.106 | 0.867 0.103 0.637 | -0.215
## 85 | 2.413 | -1.829 0.122 0.575 | -0.819 0.092 0.115 | -0.901
## 86 | 1.723 | 1.145 0.048 0.442 | 1.002 0.138 0.338 | 0.344
## 87 | 1.737 | 1.291 0.061 0.553 | 0.835 0.096 0.231 | 0.427
## 88 | 2.267 | 0.348 0.004 0.024 | 1.766 0.428 0.607 | 0.896
## 89 | 2.202 | 1.100 0.044 0.249 | 1.526 0.319 0.480 | 0.611
## 90 | 2.782 | -0.841 0.026 0.091 | -1.816 0.452 0.426 | -1.587
## 91 | 3.461 | 2.548 0.236 0.542 | 1.870 0.480 0.292 | -0.059
## 92 | 2.815 | -2.584 0.243 0.842 | -0.170 0.004 0.004 | -0.852
## 93 | 2.716 | -2.475 0.223 0.830 | -0.361 0.018 0.018 | -0.748
## 94 | 2.144 | -1.609 0.094 0.563 | 1.147 0.180 0.286 | 0.135
## 95 | 2.941 | -1.989 0.144 0.457 | 0.219 0.007 0.006 | 0.087
## 96 | 1.129 | -0.406 0.006 0.130 | 0.121 0.002 0.011 | 0.215
## 97 | 1.166 | -0.565 0.012 0.235 | 0.714 0.070 0.375 | 0.062
## 98 | 3.429 | 2.500 0.227 0.531 | 1.845 0.467 0.290 | 0.635
## 99 | 3.266 | 1.643 0.098 0.253 | 2.437 0.815 0.557 | 0.543
## 100 | 4.624 | 4.152 0.627 0.806 | 1.541 0.326 0.111 | -0.340
## ctr cos2
## 1 0.025 0.023 |
## 2 0.036 0.031 |
## 3 0.093 0.048 |
## 4 0.039 0.006 |
## 5 0.009 0.004 |
## 6 0.011 0.048 |
## 7 0.003 0.003 |
## 8 0.003 0.002 |
## 9 0.024 0.120 |
## 10 0.003 0.011 |
## 11 0.174 0.583 |
## 12 0.182 0.567 |
## 13 0.077 0.137 |
## 14 0.008 0.006 |
## 15 0.000 0.000 |
## 16 0.023 0.163 |
## 17 0.003 0.007 |
## 18 0.004 0.010 |
## 19 0.011 0.005 |
## 20 0.725 0.106 |
## 21 0.289 0.027 |
## 22 0.227 0.068 |
## 23 0.355 0.101 |
## 24 0.374 0.114 |
## 25 0.275 0.085 |
## 26 0.084 0.038 |
## 27 0.010 0.018 |
## 28 0.041 0.083 |
## 29 0.022 0.038 |
## 30 0.101 0.204 |
## 31 0.124 0.248 |
## 32 0.004 0.009 |
## 33 0.000 0.001 |
## 34 0.002 0.004 |
## 35 0.000 0.000 |
## 36 0.090 0.127 |
## 37 0.336 0.456 |
## 38 0.252 0.078 |
## 39 0.754 0.129 |
## 40 1.227 0.162 |
## 41 0.046 0.020 |
## 42 0.059 0.035 |
## 43 0.212 0.287 |
## 44 0.015 0.003 |
## 45 0.242 0.123 |
## 46 0.059 0.031 |
## 47 0.036 0.054 |
## 48 0.128 0.130 |
## 49 0.192 0.210 |
## 50 0.388 0.226 |
## 51 0.287 0.150 |
## 52 0.113 0.042 |
## 53 0.011 0.018 |
## 54 0.071 0.108 |
## 55 0.016 0.023 |
## 56 0.006 0.011 |
## 57 0.349 0.102 |
## 58 0.455 0.116 |
## 59 0.012 0.001 |
## 60 0.242 0.079 |
## 61 0.002 0.000 |
## 62 0.123 0.023 |
## 63 0.777 0.193 |
## 64 0.004 0.001 |
## 65 0.062 0.008 |
## 66 0.209 0.059 |
## 67 0.194 0.053 |
## 68 0.233 0.068 |
## 69 0.002 0.001 |
## 70 0.022 0.005 |
## 71 0.223 0.205 |
## 72 0.133 0.148 |
## 73 0.036 0.043 |
## 74 0.097 0.048 |
## 75 0.010 0.019 |
## 76 0.100 0.166 |
## 77 0.056 0.100 |
## 78 0.205 0.203 |
## 79 0.010 0.012 |
## 80 0.117 0.010 |
## 81 0.084 0.011 |
## 82 1.237 0.422 |
## 83 0.150 0.065 |
## 84 0.014 0.039 |
## 85 0.247 0.139 |
## 86 0.036 0.040 |
## 87 0.055 0.060 |
## 88 0.244 0.156 |
## 89 0.114 0.077 |
## 90 0.766 0.325 |
## 91 0.001 0.000 |
## 92 0.221 0.092 |
## 93 0.170 0.076 |
## 94 0.006 0.004 |
## 95 0.002 0.001 |
## 96 0.014 0.036 |
## 97 0.001 0.003 |
## 98 0.123 0.034 |
## 99 0.090 0.028 |
## 100 0.035 0.005 |
##
## Variables
## Dim.1 ctr cos2 Dim.2 ctr cos2 Dim.3 ctr
## Retail.Price | 0.703 6.956 0.494 | -0.643 21.950 0.414 | 0.235 6.501
## Dealer.Cost | 0.699 6.881 0.489 | -0.645 22.104 0.416 | 0.237 6.618
## Engine.Size..l. | 0.925 12.046 0.856 | 0.021 0.024 0.000 | 0.044 0.223
## Cyl | 0.891 11.168 0.793 | -0.107 0.609 0.011 | 0.075 0.663
## HP | 0.849 10.151 0.721 | -0.401 8.539 0.161 | 0.070 0.583
## City.MPG | -0.828 9.640 0.685 | 0.005 0.001 0.000 | 0.493 28.629
## Hwy.MPG | -0.817 9.400 0.668 | 0.015 0.012 0.000 | 0.552 35.880
## Weight | 0.896 11.312 0.804 | 0.230 2.804 0.053 | -0.103 1.259
## Wheel.Base | 0.710 7.087 0.503 | 0.574 17.487 0.329 | 0.244 6.994
## Len | 0.684 6.594 0.468 | 0.561 16.680 0.314 | 0.318 11.882
## Width | 0.789 8.765 0.623 | 0.429 9.790 0.184 | 0.081 0.767
## cos2
## Retail.Price 0.055 |
## Dealer.Cost 0.056 |
## Engine.Size..l. 0.002 |
## Cyl 0.006 |
## HP 0.005 |
## City.MPG 0.243 |
## Hwy.MPG 0.305 |
## Weight 0.011 |
## Wheel.Base 0.059 |
## Len 0.101 |
## Width 0.007 |
# Get the percentage of variance explained by the first 3 new dimensions
# (column 2 of $eig holds "% of var.").
pca_output_ten_v$eig[,2][1:3]
## comp 1 comp 2 comp 3
## 64.587622 17.126589 7.724803
# Get the cumulative variance (column 3 of $eig).
pca_output_ten_v$eig[,3][1:3]
## comp 1 comp 2 comp 3
## 64.58762 81.71421 89.43901
# Run a PCA with active and supplementary variables: the binary dummy
# columns (1:8) enter as supplementary quantitative variables and the
# factor columns (20:21) as supplementary qualitative variables, so they
# are projected onto, but do not shape, the principal components.
pca_output_all <- FactoMineR::PCA(cars, quanti.sup = 1:8, quali.sup = 20:21, graph = FALSE)
# Get the variables most correlated with the first two dimensions
FactoMineR::dimdesc(pca_output_all, axes = 1:2)
## $Dim.1
## $quanti
## correlation p.value
## <NA> NA NA
## Engine.Size..l. 0.9251267 5.116179e-164
## Weight 0.8964700 3.638599e-138
## Cyl 0.8907643 6.262131e-134
## HP 0.8492193 8.059693e-109
## Width 0.7891195 1.664116e-83
## Wheel.Base 0.7095703 1.671006e-60
## Retail.Price 0.7030143 5.914584e-59
## Dealer.Cost 0.6991979 4.510056e-58
## Len 0.6844621 8.580649e-55
## SUV 0.3149124 2.343396e-10
## RWD 0.3064581 7.365567e-10
## AWD 0.2192594 1.346164e-05
## Minivan 0.1435461 4.663306e-03
## Small.Sporty..Compact.Large.Sedan -0.3062320 7.590854e-10
## Hwy.MPG -0.8171975 3.651747e-94
## City.MPG -0.8275744 1.404114e-98
##
## $quali
## R2 p.value
## type 0.1505098 8.763997e-13
##
## $category
## Estimate p.value
## type=SUV 1.414274 2.343396e-10
## type=Minivan 1.074161 4.663306e-03
## type=Small.Sporty..Compact.Large.Sedan -1.224868 7.590854e-10
##
## attr(,"class")
## [1] "condes" "list "
##
## $Dim.2
## $quanti
## correlation p.value
## <NA> NA NA
## Wheel.Base 0.5739738 2.730836e-35
## Len 0.5605697 2.095168e-33
## Width 0.4294626 8.445613e-19
## Minivan 0.3206132 1.060475e-10
## Weight 0.2298540 4.913548e-06
## SUV 0.1675328 9.379368e-04
## Small.Sporty..Compact.Large.Sedan 0.1164972 2.189605e-02
## Cyl -0.1071037 3.518485e-02
## RWD -0.3862412 3.231605e-15
## HP -0.4010809 2.175520e-16
## Sports.Car -0.5919253 5.917164e-38
## Retail.Price -0.6430569 1.540011e-46
## Dealer.Cost -0.6453051 5.917390e-47
##
## $quali
## R2 p.value
## type 0.43630221 2.376020e-46
## wheeltype 0.01220005 2.981756e-02
##
## $category
## Estimate p.value
## type=Minivan 1.81590533 1.060475e-10
## type=SUV 0.47299804 9.379368e-04
## type=Small.Sporty..Compact.Large.Sedan 0.06011519 2.189605e-02
## wheeltype=AWD 0.15182838 2.981756e-02
## wheeltype=RWD -0.15182838 2.981756e-02
## type=Sports.Car -2.30896125 5.917164e-38
##
## attr(,"class")
## [1] "condes" "list "
##
## $call
## $call$num.var
## [1] 1
##
## $call$proba
## [1] 0.05
##
## $call$weights
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [75] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [112] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [149] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [186] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [223] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [260] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [297] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [334] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [371] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##
## $call$X
## Dim.1 Small.Sporty..Compact.Large.Sedan Sports.Car SUV Wagon Minivan
## 1 1.56744306 1 0 0 0 0
## 2 1.63564834 1 0 0 0 0
## 3 1.90661858 0 0 1 0 0
## 4 1.59019865 0 1 0 0 0
## 5 -2.65476353 1 0 0 0 0
## 6 0.44124255 1 0 0 0 0
## 7 -1.53811110 1 0 0 0 0
## 8 -2.03640834 1 0 0 0 0
## 9 -0.41009326 1 0 0 0 0
## 10 0.15099157 1 0 0 0 0
## 11 0.02281694 1 0 0 0 0
## 12 -0.02261647 1 0 0 0 0
## 13 0.52211427 1 0 0 0 0
## 14 -1.53394298 1 0 0 0 0
## 15 0.90578215 1 0 0 0 0
## 16 0.34728655 1 0 0 0 0
## 17 0.91438864 0 0 0 1 0
## 18 0.80871628 1 0 0 0 0
## 19 2.49964082 1 0 0 0 0
## 20 4.38344059 1 0 0 0 0
## 21 4.91430218 0 1 0 0 0
## 22 2.35761304 0 0 0 1 0
## 23 2.39109462 1 0 0 0 0
## 24 -1.96412664 0 1 0 0 0
## 25 -1.78272147 0 1 0 0 0
## 26 -0.56423976 0 1 0 0 0
## 27 -0.96255180 1 0 0 0 0
## 28 -0.42880611 1 0 0 0 0
## 29 -1.03294129 1 0 0 0 0
## 30 -0.70273231 1 0 0 0 0
## 31 -0.51531036 0 0 0 1 0
## 32 -0.47116351 1 0 0 0 0
## 33 0.06932728 1 0 0 0 0
## 34 -0.51127511 1 0 0 0 0
## 35 -0.31559999 1 0 0 0 0
## 36 0.39449482 1 0 0 0 0
## 37 0.73863903 1 0 0 0 0
## 38 2.88802054 1 0 0 0 0
## 39 3.99822210 1 0 0 0 0
## 40 4.45177167 1 0 0 0 0
## 41 1.61237188 0 1 0 0 0
## 42 1.15868104 0 1 0 0 0
## 43 1.04083449 0 0 1 0 0
## 44 3.47011998 0 0 1 0 0
## 45 -1.51289496 0 1 0 0 0
## 46 -1.04624342 0 1 0 0 0
## 47 -0.23219435 1 0 0 0 0
## 48 0.71348155 1 0 0 0 0
## 49 0.87978111 1 0 0 0 0
## 50 1.35165449 1 0 0 0 0
## 51 1.88367522 1 0 0 0 0
## 52 2.68268633 0 0 1 0 0
## 53 0.80268756 1 0 0 0 0
## 54 0.26488817 1 0 0 0 0
## 55 0.67171174 0 0 1 0 0
## 56 0.96937355 1 0 0 0 0
## 57 2.99266846 1 0 0 0 0
## 58 3.27288686 1 0 0 0 0
## 59 5.17444141 0 0 1 0 0
## 60 2.92086507 1 0 0 0 0
## 61 3.50393084 0 0 1 0 0
## 62 2.90772571 0 1 0 0 0
## 63 2.43427168 0 0 0 0 1
## 64 -4.53334106 1 0 0 0 0
## 65 -4.79149398 1 0 0 0 0
## 66 -3.20754912 1 0 0 0 0
## 67 -3.26199185 1 0 0 0 0
## 68 -3.15942711 1 0 0 0 0
## 69 2.46188769 0 1 0 0 0
## 70 2.64643439 0 1 0 0 0
## 71 -0.05645694 1 0 0 0 0
## 72 0.42777819 1 0 0 0 0
## 73 0.97880986 1 0 0 0 0
## 74 -2.26370423 1 0 0 0 0
## 75 -0.68868268 1 0 0 0 0
## 76 -0.76354383 1 0 0 0 0
## 77 -0.33561356 0 0 0 1 0
## 78 -0.15694485 1 0 0 0 0
## 79 0.57521022 1 0 0 0 0
## 80 5.59372841 0 0 1 0 0
## 81 4.65529650 0 0 1 0 0
## 82 -1.82799619 0 0 1 0 0
## 83 2.31803627 0 0 1 0 0
## 84 0.35338449 0 0 0 0 1
## 85 -1.82948473 1 0 0 0 0
## 86 1.14457520 1 0 0 0 0
## 87 1.29140843 1 0 0 0 0
## 88 0.34765644 1 0 0 0 0
## 89 1.09965378 1 0 0 0 0
## 90 -0.84064314 0 1 0 0 0
## 91 2.54756113 0 0 0 1 0
## 92 -2.58378347 1 0 0 0 0
## 93 -2.47450493 1 0 0 0 0
## 94 -1.60870534 1 0 0 0 0
## 95 -1.98909877 1 0 0 0 0
## 96 -0.40621008 1 0 0 0 0
## 97 -0.56522046 1 0 0 0 0
## 98 2.49975001 0 0 0 0 1
## 99 1.64251636 0 0 0 0 1
## 100 4.15185785 0 0 1 0 0
## 101 -0.01486139 0 0 0 0 1
## 102 3.60500561 0 0 1 0 0
## 103 2.39811223 0 0 0 0 1
## 104 1.08634668 1 0 0 0 0
## 105 0.29654000 1 0 0 0 0
## 106 -3.79056785 1 0 0 0 0
## 107 -3.73260984 1 0 0 0 0
## 108 -1.40969996 1 0 0 0 0
## 109 -1.44431200 1 0 0 0 0
## 110 2.80035990 1 0 0 0 0
## 111 2.87386028 1 0 0 0 0
## 112 3.02029255 1 0 0 0 0
## 113 -0.55668451 0 0 1 0 0
## 114 4.10184280 0 0 1 0 0
## 115 1.80516325 0 0 1 0 0
## 116 -3.95050227 1 0 0 0 0
## 117 -3.59072066 1 0 0 0 0
## 118 -2.67386392 1 0 0 0 0
## 119 -3.29614914 0 0 0 1 0
## 120 -3.64673120 1 0 0 0 0
## 121 -3.54694020 1 0 0 0 0
## 122 2.16632153 0 0 0 0 1
## 123 -0.52155461 0 1 0 0 0
## 124 1.22161854 0 1 0 0 0
## 125 -0.20447919 1 0 0 0 0
## 126 0.04967423 0 0 0 1 0
## 127 0.18258717 1 0 0 0 0
## 128 1.75029340 0 1 0 0 0
## 129 3.68921496 0 0 1 0 0
## 130 1.99056727 0 0 0 0 1
## 131 6.74196916 0 0 1 0 0
## 132 -2.17287603 1 0 0 0 0
## 133 -0.39898994 1 0 0 0 0
## 134 -2.26192485 1 0 0 0 0
## 135 -0.21810209 1 0 0 0 0
## 136 -4.39883279 1 0 0 0 0
## 137 -4.00126616 1 0 0 0 0
## 138 -4.89634024 1 0 0 0 0
## 139 -5.72150410 1 0 0 0 0
## 140 -4.20307600 1 0 0 0 0
## 141 -3.21807286 1 0 0 0 0
## 142 -1.68117329 0 0 1 0 0
## 143 -1.69040962 0 0 1 0 0
## 144 -8.69520656 1 0 0 0 0
## 145 1.92730333 0 0 0 0 1
## 146 1.83464108 0 0 0 0 1
## 147 1.54734354 0 0 1 0 0
## 148 -1.89945869 0 1 0 0 0
## 149 6.75491429 0 0 1 0 0
## 150 -4.69603286 1 0 0 0 0
## 151 -4.64712746 1 0 0 0 0
## 152 -4.62105094 1 0 0 0 0
## 153 -3.35966280 1 0 0 0 0
## 154 -3.31798672 1 0 0 0 0
## 155 -3.28793562 1 0 0 0 0
## 156 -0.65237220 0 0 1 0 0
## 157 -0.66935355 1 0 0 0 0
## 158 -0.64413844 1 0 0 0 0
## 159 -1.47494619 0 1 0 0 0
## 160 0.40173949 1 0 0 0 0
## 161 0.44415839 1 0 0 0 0
## 162 1.98688604 0 0 0 1 0
## 163 3.32079950 0 0 0 1 0
## 164 0.39387058 1 0 0 0 0
## 165 0.66261750 1 0 0 0 0
## 166 0.72738288 1 0 0 0 0
## 167 0.43851186 1 0 0 0 0
## 168 2.60149089 1 0 0 0 0
## 169 3.35546972 1 0 0 0 0
## 170 3.74817848 0 0 1 0 0
## 171 0.03540829 0 0 1 0 0
## 172 1.20275004 1 0 0 0 0
## 173 2.43134693 1 0 0 0 0
## 174 3.57717780 1 0 0 0 0
## 175 3.29940874 1 0 0 0 0
## 176 3.05873910 1 0 0 0 0
## 177 4.24184144 1 0 0 0 0
## 178 2.58526651 0 1 0 0 0
## 179 2.35568386 0 1 0 0 0
## 180 3.65316303 0 1 0 0 0
## 181 3.43502842 0 1 0 0 0
## 182 -0.33428272 1 0 0 0 0
## 183 0.19934972 1 0 0 0 0
## 184 0.78410816 0 0 1 0 0
## 185 -1.21383648 0 0 1 0 0
## 186 -0.83065595 0 0 1 0 0
## 187 -1.83839991 1 0 0 0 0
## 188 -0.74100630 1 0 0 0 0
## 189 -4.32447630 1 0 0 0 0
## 190 -4.48815081 1 0 0 0 0
## 191 -4.42288317 0 0 0 1 0
## 192 1.70626394 0 0 0 0 1
## 193 0.75364786 0 0 1 0 0
## 194 -3.36417731 1 0 0 0 0
## 195 -3.31831056 1 0 0 0 0
## 196 -3.28491939 1 0 0 0 0
## 197 2.75022807 0 0 1 0 0
## 198 -0.48831435 0 0 1 0 0
## 199 5.10153978 0 0 1 0 0
## 200 0.09939308 1 0 0 0 0
## 201 0.69920994 1 0 0 0 0
## 202 2.28554605 1 0 0 0 0
## 203 3.18416200 0 0 1 0 0
## 204 -0.35419141 1 0 0 0 0
## 205 -0.45902286 1 0 0 0 0
## 206 -0.29350246 0 0 0 1 0
## 207 2.88476921 1 0 0 0 0
## 208 4.66460780 0 0 1 0 0
## 209 1.05713434 0 0 1 0 0
## 210 2.34197774 0 1 0 0 0
## 211 4.05206388 0 0 1 0 0
## 212 0.85216204 1 0 0 0 0
## 213 0.96917716 1 0 0 0 0
## 214 2.35734578 1 0 0 0 0
## 215 2.44776656 1 0 0 0 0
## 216 5.88458627 0 0 1 0 0
## 217 3.65064149 1 0 0 0 0
## 218 3.73334990 1 0 0 0 0
## 219 4.27313133 1 0 0 0 0
## 220 0.52502032 0 0 0 0 1
## 221 -3.92353051 0 1 0 0 0
## 222 -3.84806035 0 1 0 0 0
## 223 -1.99785247 0 0 1 0 0
## 224 -2.16309964 1 0 0 0 0
## 225 -1.96674420 1 0 0 0 0
## 226 -0.55569563 0 0 0 1 0
## 227 -0.72721781 1 0 0 0 0
## 228 -0.63569922 1 0 0 0 0
## 229 1.37655823 1 0 0 0 0
## 230 -0.17515935 1 0 0 0 0
## 231 -0.37595019 1 0 0 0 0
## 232 -0.17184686 1 0 0 0 0
## 233 4.53421147 1 0 0 0 0
## 234 8.01434732 1 0 0 0 0
## 235 0.37228122 1 0 0 0 0
## 236 2.33967687 1 0 0 0 0
## 237 1.14053547 0 0 0 1 0
## 238 0.91506168 1 0 0 0 0
## 239 3.31103826 0 0 0 1 0
## 240 3.24006692 1 0 0 0 0
## 241 4.91688280 0 0 1 0 0
## 242 3.58463919 0 0 1 0 0
## 243 3.86180066 1 0 0 0 0
## 244 4.90311072 1 0 0 0 0
## 245 3.55570811 0 1 0 0 0
## 246 5.74723726 0 1 0 0 0
## 247 7.03712661 0 1 0 0 0
## 248 -2.21505312 0 1 0 0 0
## 249 0.38003795 0 1 0 0 0
## 250 2.80795786 1 0 0 0 0
## 251 2.93124193 1 0 0 0 0
## 252 2.96581458 1 0 0 0 0
## 253 3.59284655 1 0 0 0 0
## 254 2.60267189 0 0 0 0 1
## 255 1.65841641 0 0 1 0 0
## 256 0.05373459 0 0 0 1 0
## 257 -0.13341967 1 0 0 0 0
## 258 0.25348619 1 0 0 0 0
## 259 -4.84786750 1 0 0 0 0
## 260 -4.11663112 1 0 0 0 0
## 261 0.36269622 1 0 0 0 0
## 262 -1.02912104 0 1 0 0 0
## 263 -0.95159372 0 1 0 0 0
## 264 1.44849715 0 0 1 0 0
## 265 0.63022064 1 0 0 0 0
## 266 -0.92645852 0 1 0 0 0
## 267 2.15055676 0 0 1 0 0
## 268 -1.91213626 0 0 1 0 0
## 269 -0.09401715 0 1 0 0 0
## 270 0.21806835 0 1 0 0 0
## 271 -1.23594798 1 0 0 0 0
## 272 0.05527955 1 0 0 0 0
## 273 0.58943753 1 0 0 0 0
## 274 0.64315302 1 0 0 0 0
## 275 0.92576969 0 0 0 1 0
## 276 5.06559685 0 0 1 0 0
## 277 0.89121073 0 0 1 0 0
## 278 2.04339105 0 0 0 0 1
## 279 2.44540334 0 0 0 0 1
## 280 -3.93750441 1 0 0 0 0
## 281 -3.85566517 1 0 0 0 0
## 282 -2.59782885 1 0 0 0 0
## 283 -0.03612148 0 0 1 0 0
## 284 -0.68345580 1 0 0 0 0
## 285 -2.26725794 1 0 0 0 0
## 286 1.09219806 0 0 0 0 1
## 287 0.17827352 0 0 1 0 0
## 288 -0.69389643 1 0 0 0 0
## 289 0.40752926 1 0 0 0 0
## 290 0.46182303 1 0 0 0 0
## 291 0.31831349 0 0 0 0 1
## 292 1.59700782 0 0 0 0 1
## 293 -2.88369521 1 0 0 0 0
## 294 -2.82444668 1 0 0 0 0
## 295 -3.60757465 0 0 0 1 0
## 296 1.53737294 0 1 0 0 0
## 297 1.03263332 0 1 0 0 0
## 298 5.15806550 0 1 0 0 0
## 299 0.96204055 0 1 0 0 0
## 300 -1.06794148 0 1 0 0 0
## 301 -0.19503814 0 1 0 0 0
## 302 4.35126082 0 0 1 0 0
## 303 -1.29662124 1 0 0 0 0
## 304 -0.96545657 1 0 0 0 0
## 305 -1.08413906 1 0 0 0 0
## 306 -1.36373803 1 0 0 0 0
## 307 -0.24556100 0 0 0 1 0
## 308 -0.46335286 1 0 0 0 0
## 309 -0.72704609 1 0 0 0 0
## 310 -3.33558483 1 0 0 0 0
## 311 -2.20408469 0 0 0 1 0
## 312 -0.74290239 1 0 0 0 0
## 313 -3.24623059 1 0 0 0 0
## 314 -3.11521846 1 0 0 0 0
## 315 -3.20500661 1 0 0 0 0
## 316 -3.07466667 1 0 0 0 0
## 317 -1.43563960 0 0 1 0 0
## 318 -5.32871274 1 0 0 0 0
## 319 -4.82539717 0 0 0 1 0
## 320 -2.23269022 0 0 0 1 0
## 321 -2.32118846 1 0 0 0 0
## 322 -1.84502371 0 1 0 0 0
## 323 -0.76153929 0 1 0 0 0
## 324 -1.52594330 1 0 0 0 0
## 325 -1.71586770 1 0 0 0 0
## 326 -1.49734306 0 0 0 1 0
## 327 -0.26417696 1 0 0 0 0
## 328 -0.19667226 1 0 0 0 0
## 329 -1.32479117 1 0 0 0 0
## 330 -3.27537823 1 0 0 0 0
## 331 -3.23029477 1 0 0 0 0
## 332 -2.96215869 0 0 0 1 0
## 333 -2.92981106 1 0 0 0 0
## 334 -3.22070146 1 0 0 0 0
## 335 -0.80333934 1 0 0 0 0
## 336 -1.82761487 0 0 1 0 0
## 337 0.05424039 0 0 1 0 0
## 338 1.47609428 0 0 1 0 0
## 339 -0.19545189 1 0 0 0 0
## 340 -0.07408327 1 0 0 0 0
## 341 -1.97377420 1 0 0 0 0
## 342 -0.49954038 1 0 0 0 0
## 343 -1.76221089 1 0 0 0 0
## 344 -0.06442358 1 0 0 0 0
## 345 0.06691974 1 0 0 0 0
## 346 -0.38486659 1 0 0 0 0
## 347 -3.07469590 0 1 0 0 0
## 348 -4.32738711 1 0 0 0 0
## 349 -4.28860219 1 0 0 0 0
## 350 -4.29565702 1 0 0 0 0
## 351 -5.60509985 1 0 0 0 0
## 352 -5.98672380 1 0 0 0 0
## 353 -5.96278064 1 0 0 0 0
## 354 0.59511459 0 0 1 0 0
## 355 4.71921416 0 0 1 0 0
## 356 -3.65416545 0 0 0 1 0
## 357 -4.03041103 0 1 0 0 0
## 358 -6.17792729 1 0 0 0 0
## 359 -2.50038963 0 0 1 0 0
## 360 4.33990152 0 0 1 0 0
## 361 1.52971133 0 0 0 0 1
## 362 1.68811436 0 0 0 0 1
## 363 -3.32630973 1 0 0 0 0
## 364 -2.99967649 1 0 0 0 0
## 365 -3.02854976 0 0 0 1 0
## 366 -1.58538156 1 0 0 0 0
## 367 -4.82323989 1 0 0 0 0
## 368 -3.28732241 1 0 0 0 0
## 369 -3.13925126 1 0 0 0 0
## 370 -1.93461164 0 0 0 1 0
## 371 -1.98809607 1 0 0 0 0
## 372 -0.17608276 1 0 0 0 0
## 373 1.49452661 0 0 0 1 0
## 374 1.43291848 1 0 0 0 0
## 375 2.24760660 0 0 1 0 0
## 376 -0.01331806 1 0 0 0 0
## 377 -0.40574622 1 0 0 0 0
## 378 -2.44078079 1 0 0 0 0
## 379 -0.36091329 1 0 0 0 0
## 380 0.30300253 1 0 0 0 0
## 381 -0.28686597 1 0 0 0 0
## 382 0.03637743 1 0 0 0 0
## 383 0.34800123 1 0 0 0 0
## 384 1.02847595 1 0 0 0 0
## 385 -2.34851831 0 0 0 1 0
## 386 0.06112437 0 0 0 1 0
## 387 2.31125250 0 0 1 0 0
## Pickup AWD RWD Retail.Price Dealer.Cost Engine.Size..l. Cyl HP City.MPG
## 1 0 0 0 43755 39014 3.5 6 225 18
## 2 0 0 0 46100 41100 3.5 6 225 18
## 3 0 1 0 36945 33337 3.5 6 265 17
## 4 0 0 1 89765 79978 3.2 6 290 17
## 5 0 0 0 23820 21761 2.0 4 200 24
## 6 0 0 0 33195 30299 3.2 6 270 20
## 7 0 0 0 26990 24647 2.4 4 200 22
## 8 0 0 0 25940 23508 1.8 4 170 22
## 9 0 0 0 31840 28846 3.0 6 220 20
## 10 0 0 0 42490 38325 3.0 6 220 20
## 11 0 1 0 34480 31388 3.0 6 220 18
## 12 0 1 0 33430 30366 3.0 6 220 17
## 13 0 1 0 44240 40075 3.0 6 220 18
## 14 0 0 0 35940 32506 1.8 4 170 23
## 15 0 1 0 42840 38840 2.7 6 250 18
## 16 0 0 0 36640 33129 3.0 6 220 20
## 17 0 1 0 40840 37060 3.0 6 220 18
## 18 0 1 0 39640 35992 3.0 6 220 18
## 19 0 1 0 49690 44936 4.2 8 300 17
## 20 0 1 0 69190 64740 4.2 8 330 17
## 21 0 0 0 84600 76417 4.2 8 450 15
## 22 0 1 0 49090 44446 4.2 8 340 15
## 23 0 1 0 48040 43556 4.2 8 340 14
## 24 0 0 0 35940 32512 1.8 4 180 20
## 25 0 1 0 37390 33891 1.8 4 225 20
## 26 0 1 0 40590 36739 3.2 6 250 21
## 27 0 0 1 30795 28245 2.5 6 184 20
## 28 0 0 1 37995 34800 2.5 6 184 19
## 29 0 0 1 28495 26155 2.5 6 184 20
## 30 0 1 0 30245 27745 2.5 6 184 19
## 31 0 1 0 32845 30110 2.5 6 184 19
## 32 0 0 1 36995 33890 3.0 6 225 20
## 33 0 0 1 44295 40530 3.0 6 225 19
## 34 0 0 1 35495 32525 3.0 6 225 20
## 35 0 1 0 37245 34115 3.0 6 225 20
## 36 0 0 1 39995 36620 2.5 6 184 19
## 37 0 0 1 44995 41170 3.0 6 225 20
## 38 0 0 1 54995 50270 4.4 8 325 18
## 39 0 0 1 69195 63190 4.4 8 325 18
## 40 0 0 1 73195 66830 4.4 8 325 18
## 41 0 0 1 56595 51815 3.2 6 333 16
## 42 0 0 1 48195 44170 3.2 6 333 16
## 43 0 1 0 37000 33873 3.0 6 225 16
## 44 0 1 0 52195 47720 4.4 8 325 16
## 45 0 0 1 33895 31065 2.5 6 184 20
## 46 0 0 1 41045 37575 3.0 6 225 21
## 47 0 0 0 22180 20351 3.1 6 175 20
## 48 0 0 0 26470 24282 3.8 6 205 20
## 49 0 0 0 32245 29566 3.8 6 205 20
## 50 0 0 0 35545 32244 3.8 6 205 20
## 51 0 0 0 40720 36927 3.8 6 240 18
## 52 0 1 0 37895 34357 4.2 6 275 15
## 53 0 0 0 28345 26047 3.8 6 240 18
## 54 0 0 0 24895 22835 3.8 6 200 20
## 55 0 0 0 26545 24085 3.4 6 185 19
## 56 0 0 1 30835 28575 3.6 6 255 18
## 57 0 0 0 45445 41650 4.6 8 275 18
## 58 0 0 0 50595 46362 4.6 8 300 18
## 59 0 0 0 52795 48377 5.3 8 295 14
## 60 0 0 0 47955 43841 4.6 8 275 18
## 61 0 0 0 46995 43523 4.6 8 320 16
## 62 0 0 1 76200 70546 4.6 8 320 17
## 63 0 1 0 26395 23954 4.3 6 190 14
## 64 0 0 0 11690 10965 1.6 4 103 28
## 65 0 0 0 12585 11802 1.6 4 103 28
## 66 0 0 0 14610 13697 2.2 4 140 26
## 67 0 0 0 14810 13884 2.2 4 140 26
## 68 0 0 0 16385 15357 2.2 4 140 26
## 69 0 0 1 44535 39068 5.7 8 350 18
## 70 0 0 1 51535 45193 5.7 8 350 18
## 71 0 0 0 21900 20095 3.4 6 180 21
## 72 0 0 0 25000 22931 3.8 6 200 20
## 73 0 0 0 27995 25672 3.8 6 240 18
## 74 0 0 0 18995 17434 2.2 4 145 24
## 75 0 0 0 20370 18639 3.5 6 200 22
## 76 0 0 0 23495 21551 3.5 6 200 23
## 77 0 0 0 22225 20394 3.5 6 200 22
## 78 0 0 0 21825 20026 3.4 6 180 21
## 79 0 0 0 24225 22222 3.8 6 200 18
## 80 0 0 0 42735 37422 5.3 8 295 14
## 81 0 1 0 41465 36287 5.3 8 295 14
## 82 0 0 0 20255 19108 2.5 6 165 19
## 83 0 0 0 30295 27479 4.2 6 275 16
## 84 0 0 0 27020 24518 3.4 6 185 19
## 85 0 0 0 25955 24172 2.4 4 220 21
## 86 0 0 0 29865 27797 3.5 6 250 18
## 87 0 0 0 33295 30884 3.5 6 255 18
## 88 0 0 0 24130 22452 2.7 6 200 21
## 89 0 0 0 26860 24909 3.5 6 232 19
## 90 0 0 1 34495 32033 3.2 6 215 17
## 91 0 0 1 31230 28725 3.5 6 250 17
## 92 0 0 0 17985 16919 2.4 4 150 22
## 93 0 0 0 22000 20573 2.4 4 150 22
## 94 0 0 0 19090 17805 2.4 4 150 22
## 95 0 0 0 25215 23451 2.4 4 150 22
## 96 0 0 0 30950 28613 2.7 6 200 21
## 97 0 0 0 21840 20284 2.7 6 200 21
## 98 0 0 0 38380 35063 3.8 6 215 18
## 99 0 0 0 27490 25371 3.3 6 180 19
## 100 0 0 0 35725 31361 4.8 8 285 16
## 101 0 0 0 21795 20508 2.4 4 150 20
## 102 0 1 0 32235 29472 4.7 8 230 15
## 103 0 1 0 32660 29812 3.8 6 215 18
## 104 0 0 0 24885 23058 3.5 6 232 18
## 105 0 0 0 22035 20502 2.7 6 200 21
## 106 0 0 0 13670 12849 2.0 4 132 29
## 107 0 0 0 15040 14086 2.0 4 132 29
## 108 0 0 0 20220 18821 2.4 4 150 21
## 109 0 0 0 18820 17512 2.4 4 150 21
## 110 0 0 1 24345 22856 4.6 8 224 17
## 111 0 0 1 27370 25105 4.6 8 224 17
## 112 0 0 1 30315 27756 4.6 8 239 17
## 113 0 1 0 22515 20907 3.0 6 201 18
## 114 0 0 0 34560 30468 4.6 8 232 15
## 115 0 1 0 29670 26983 4.0 6 210 15
## 116 0 0 0 13730 12906 2.0 4 110 27
## 117 0 0 0 15460 14496 2.0 4 130 26
## 118 0 0 0 19135 17878 2.0 4 170 21
## 119 0 0 0 17475 16375 2.0 4 130 26
## 120 0 0 0 13270 12482 2.0 4 130 26
## 121 0 0 0 15580 14607 2.0 4 130 26
## 122 0 0 0 26930 24498 3.9 6 193 17
## 123 0 0 1 18345 16943 3.8 6 193 20
## 124 0 0 1 29380 26875 4.6 8 260 17
## 125 0 0 0 20320 18881 3.0 6 155 20
## 126 0 0 0 22290 20457 3.0 6 155 19
## 127 0 0 0 22735 20857 3.0 6 201 19
## 128 0 0 0 37530 34483 3.9 8 280 17
## 129 0 0 0 31890 28922 4.2 6 275 15
## 130 0 0 1 25640 23215 4.3 6 190 16
## 131 0 1 0 46265 40534 6.0 8 325 13
## 132 0 0 0 22260 20080 2.4 4 160 26
## 133 0 0 0 26960 24304 3.0 6 240 21
## 134 0 0 0 19860 17924 2.4 4 160 26
## 135 0 0 0 23760 21428 3.0 6 240 21
## 136 0 0 0 13270 12175 1.7 4 115 32
## 137 0 0 0 17750 16265 1.7 4 127 32
## 138 0 0 0 14170 12996 1.7 4 117 36
## 139 0 0 0 20140 18451 1.4 4 93 46
## 140 0 0 0 15850 14531 1.7 4 115 32
## 141 0 0 0 19490 17849 2.0 4 160 26
## 142 0 1 0 19860 18419 2.4 4 160 21
## 143 0 1 0 18690 17334 2.4 4 160 21
## 144 0 0 0 19110 17911 2.0 3 73 60
## 145 0 0 0 27450 24744 3.5 6 240 18
## 146 0 0 0 24950 22498 3.5 6 240 18
## 147 0 1 0 27560 24843 3.5 6 240 17
## 148 0 0 1 33260 29965 2.2 4 240 20
## 149 0 1 0 49995 45815 6.0 8 316 10
## 150 0 0 0 10539 10107 1.6 4 103 29
## 151 0 0 0 11839 11116 1.6 4 103 29
## 152 0 0 0 11939 11209 1.6 4 103 29
## 153 0 0 0 13839 12781 2.0 4 138 26
## 154 0 0 0 15389 14207 2.0 4 138 26
## 155 0 0 0 15389 14207 2.0 4 138 26
## 156 0 0 0 21589 20201 2.7 6 173 20
## 157 0 0 0 19339 17574 2.7 6 170 19
## 158 0 0 0 20339 18380 2.7 6 170 19
## 159 0 0 0 18739 17101 2.7 6 172 19
## 160 0 0 0 24589 22055 3.5 6 194 17
## 161 0 0 0 26189 23486 3.5 6 194 17
## 162 0 0 1 34895 31756 3.5 6 280 16
## 163 0 1 0 36395 33121 4.5 8 315 15
## 164 0 0 1 28495 26157 3.5 6 260 18
## 165 0 1 0 32445 29783 3.5 6 260 18
## 166 0 0 1 29795 27536 3.5 6 280 18
## 167 0 0 0 31145 28320 3.5 6 255 19
## 168 0 0 1 42845 38792 4.5 8 340 17
## 169 0 0 1 52545 47575 4.5 8 340 17
## 170 0 1 0 31849 29977 4.2 6 275 15
## 171 0 0 0 20449 19261 3.2 6 193 17
## 172 0 0 1 43895 40004 3.0 6 235 18
## 173 0 0 1 49995 45556 4.2 8 294 18
## 174 0 0 1 63120 57499 4.2 8 390 17
## 175 0 0 1 68995 62846 4.2 8 294 18
## 176 0 0 1 59995 54656 4.2 8 294 18
## 177 0 0 1 74995 68306 4.2 8 390 17
## 178 0 0 1 74995 68306 4.2 8 294 18
## 179 0 0 1 69995 63756 4.2 8 294 18
## 180 0 0 1 86995 79226 4.2 8 390 16
## 181 0 0 1 81995 74676 4.2 8 390 16
## 182 0 1 0 29995 27355 2.5 6 192 18
## 183 0 1 0 33995 30995 3.0 6 227 18
## 184 0 0 0 27905 25686 4.0 6 195 16
## 185 0 1 0 20130 18973 2.4 4 150 20
## 186 0 1 0 25520 23275 4.0 6 190 16
## 187 0 0 0 16040 14910 2.4 4 138 23
## 188 0 0 0 18435 16850 2.7 6 170 20
## 189 0 0 0 11155 10705 1.6 4 104 25
## 190 0 0 0 10280 9875 1.6 4 104 26
## 191 0 0 0 11905 11410 1.6 4 104 26
## 192 0 0 0 20615 19400 3.5 6 195 16
## 193 0 0 0 19635 18630 3.5 6 192 16
## 194 0 0 0 12360 11630 1.8 4 124 24
## 195 0 0 0 13580 12830 1.8 4 124 24
## 196 0 0 0 14630 13790 1.8 4 124 24
## 197 0 1 0 39250 35777 4.6 8 217 12
## 198 0 1 0 25995 23969 2.5 6 174 18
## 199 0 1 0 72250 65807 4.4 8 282 12
## 200 0 0 0 32350 28755 3.3 6 225 20
## 201 0 0 1 41010 36196 3.0 6 220 18
## 202 0 0 1 48450 42232 4.3 8 300 18
## 203 0 1 0 45700 39838 4.7 8 235 15
## 204 0 0 1 32415 28611 3.0 6 215 18
## 205 0 0 1 31045 27404 3.0 6 215 18
## 206 0 0 1 32455 28647 3.0 6 215 18
## 207 0 0 1 55750 48583 4.3 8 290 18
## 208 0 1 0 64800 56455 4.7 8 235 13
## 209 0 1 0 39195 34576 3.3 6 230 18
## 210 0 0 1 63200 55063 4.3 8 300 18
## 211 0 0 0 42915 39443 4.6 8 302 13
## 212 0 0 1 32495 29969 3.0 6 232 20
## 213 0 0 1 36895 33929 3.0 6 232 20
## 214 0 0 1 40095 36809 3.9 8 280 17
## 215 0 0 1 43495 39869 3.9 8 280 17
## 216 0 1 0 52775 46360 5.4 8 300 13
## 217 0 0 1 41815 38418 4.6 8 239 17
## 218 0 0 1 44925 41217 4.6 8 239 17
## 219 0 0 1 50470 46208 4.6 8 239 17
## 220 0 0 0 28750 26600 3.0 6 200 18
## 221 0 0 1 22388 20701 1.8 4 142 23
## 222 0 0 1 25193 23285 1.8 4 142 23
## 223 0 1 0 21087 19742 2.0 4 130 22
## 224 0 0 0 19270 17817 2.3 4 160 24
## 225 0 0 1 26060 24249 1.8 4 189 22
## 226 0 0 1 33780 31466 2.6 6 168 19
## 227 0 0 1 32280 30071 2.6 6 168 20
## 228 0 1 0 33480 31187 2.6 6 168 19
## 229 0 0 1 52120 48522 3.2 6 349 16
## 230 0 0 1 37630 35046 3.2 6 215 20
## 231 0 0 1 28370 26435 3.2 6 215 19
## 232 0 0 1 35920 33456 3.2 6 215 19
## 233 0 0 1 94820 88324 5.0 8 302 16
## 234 0 0 1 128420 119600 5.5 12 493 13
## 235 0 0 1 45707 41966 3.2 6 215 20
## 236 0 0 1 52800 49104 5.0 8 302 17
## 237 0 0 1 50670 47174 3.2 6 221 19
## 238 0 0 1 48170 44849 3.2 6 221 19
## 239 0 1 0 60670 56474 5.0 8 302 16
## 240 0 0 1 57270 53382 5.0 8 302 16
## 241 0 1 0 76870 71540 5.0 8 292 13
## 242 0 1 0 46470 43268 5.0 8 288 14
## 243 0 0 1 74320 69168 4.3 8 275 18
## 244 0 1 0 86970 80939 5.0 8 302 16
## 245 0 0 1 90520 84325 5.0 8 302 16
## 246 0 0 1 121770 113388 5.5 8 493 14
## 247 0 0 1 126670 117854 5.5 12 493 13
## 248 0 0 1 40320 37548 2.3 4 192 21
## 249 0 0 1 56170 52289 3.2 6 349 17
## 250 0 0 1 24695 23217 4.6 8 224 17
## 251 0 0 1 29595 27148 4.6 8 224 17
## 252 0 0 1 30895 28318 4.6 8 224 17
## 253 0 0 1 34495 31558 4.6 8 302 17
## 254 0 0 0 33995 30846 4.2 6 201 16
## 255 0 0 0 29995 27317 4.0 6 210 16
## 256 0 0 0 22595 20748 3.0 6 155 19
## 257 0 0 0 21595 19848 3.0 6 155 20
## 258 0 0 0 23895 21918 3.0 6 201 19
## 259 0 0 0 16999 15437 1.6 4 115 28
## 260 0 0 0 19999 18137 1.6 4 163 25
## 261 0 0 0 29282 27250 3.5 6 205 18
## 262 0 0 0 25092 23456 3.0 6 210 21
## 263 0 0 0 26992 25218 3.0 6 210 21
## 264 0 1 0 30492 28330 3.8 6 215 17
## 265 0 0 0 25700 23883 3.8 6 230 18
## 266 0 0 0 29562 27466 2.0 4 271 18
## 267 0 1 0 33112 30763 3.8 6 215 15
## 268 0 0 0 18892 17569 2.4 4 160 21
## 269 0 0 1 26910 25203 3.5 6 287 20
## 270 0 0 1 34390 31845 3.5 6 287 20
## 271 0 0 0 19240 18030 2.5 4 175 21
## 272 0 0 0 23290 21580 3.5 6 245 21
## 273 0 0 0 27490 25182 3.5 6 265 20
## 274 0 0 0 29440 26966 3.5 6 265 20
## 275 0 0 1 28739 27300 3.5 6 245 20
## 276 0 0 0 33840 30815 5.6 8 305 13
## 277 0 0 0 27339 25972 3.5 6 240 16
## 278 0 0 0 24780 22958 3.5 6 240 19
## 279 0 0 0 32780 30019 3.5 6 240 18
## 280 0 0 0 12740 12205 1.8 4 126 28
## 281 0 0 0 14740 13747 1.8 4 126 28
## 282 0 0 0 17640 16444 2.5 4 165 23
## 283 0 0 0 20939 19512 3.3 6 180 17
## 284 0 0 0 23675 21485 3.4 6 170 20
## 285 0 0 0 18825 17642 2.2 4 140 24
## 286 0 0 0 28790 26120 3.4 6 185 19
## 287 0 0 0 21595 19810 3.4 6 185 19
## 288 0 0 0 22450 20595 3.4 6 175 20
## 289 0 0 0 22395 20545 3.8 6 200 20
## 290 0 0 0 24295 22284 3.8 6 200 20
## 291 0 0 0 23845 21644 3.4 6 185 19
## 292 0 1 0 31370 28454 3.4 6 185 18
## 293 0 0 0 15495 14375 2.2 4 140 24
## 294 0 0 0 17735 16369 2.2 4 140 24
## 295 0 0 1 17045 15973 1.8 4 130 29
## 296 0 1 0 84165 72206 3.6 6 315 17
## 297 0 0 1 79165 69229 3.6 6 315 18
## 298 0 0 1 192465 173560 3.6 6 477 17
## 299 0 0 1 76765 67128 3.6 6 315 18
## 300 0 0 1 43365 37886 2.7 6 228 20
## 301 0 0 1 52365 45766 3.2 6 258 18
## 302 0 1 0 56665 49865 4.5 8 340 14
## 303 0 0 0 33360 31562 2.0 4 210 20
## 304 0 0 0 43175 40883 2.0 4 210 21
## 305 0 0 0 40670 38520 2.0 4 210 21
## 306 0 0 0 30860 29269 2.0 4 210 20
## 307 0 0 0 40845 38376 2.3 4 250 19
## 308 0 0 0 39465 37721 2.3 4 250 21
## 309 0 0 0 35105 33011 2.3 4 220 21
## 310 0 0 0 10995 10319 2.2 4 140 26
## 311 0 0 0 23560 21779 2.2 4 140 24
## 312 0 0 0 21410 19801 3.0 6 182 20
## 313 0 0 0 14300 13393 2.2 4 140 26
## 314 0 0 0 14850 13904 2.2 4 140 26
## 315 0 0 0 15825 14811 2.2 4 140 26
## 316 0 0 0 16350 15299 2.2 4 140 26
## 317 0 1 0 20585 19238 2.2 4 143 21
## 318 0 0 0 12965 12340 1.5 4 108 32
## 319 0 0 0 14165 13480 1.5 4 108 31
## 320 0 1 0 21445 19646 2.5 4 165 21
## 321 0 1 0 19945 18399 2.5 4 165 22
## 322 0 1 0 25045 23022 2.0 4 227 20
## 323 0 1 0 31545 29130 2.5 4 300 18
## 324 0 1 0 25645 23336 2.5 4 165 21
## 325 0 1 0 20445 18713 2.5 4 165 21
## 326 0 1 0 23895 21773 2.5 4 165 21
## 327 0 1 0 29345 26660 3.0 6 212 19
## 328 0 1 0 31545 28603 3.0 6 212 19
## 329 0 1 0 27145 24687 2.5 4 165 20
## 330 0 0 0 12884 12719 2.3 4 155 25
## 331 0 0 0 14500 14317 2.3 4 155 25
## 332 0 1 0 16497 16291 2.3 4 155 24
## 333 0 0 0 15568 15378 2.0 4 119 22
## 334 0 0 0 12269 12116 2.0 4 119 24
## 335 0 0 0 17262 17053 2.5 6 155 20
## 336 0 1 0 17163 16949 2.5 6 165 19
## 337 0 0 0 23699 22307 2.7 6 185 18
## 338 0 0 0 27710 24801 4.0 6 245 18
## 339 0 0 0 26560 23693 3.0 6 210 21
## 340 0 0 0 30920 27271 3.0 6 210 21
## 341 0 0 0 19560 17558 2.4 4 157 24
## 342 0 0 0 22775 20325 3.0 6 210 21
## 343 0 0 0 19635 17722 2.4 4 157 24
## 344 0 0 0 21965 19819 3.3 6 225 20
## 345 0 0 0 26510 23908 3.3 6 225 20
## 346 0 0 0 25920 23125 3.0 6 210 21
## 347 0 0 0 22570 20363 1.8 4 180 24
## 348 0 0 0 14085 13065 1.8 4 130 32
## 349 0 0 0 15295 13889 1.8 4 130 32
## 350 0 0 0 15030 13650 1.8 4 130 32
## 351 0 0 0 11560 10896 1.5 4 108 33
## 352 0 0 0 10760 10144 1.5 4 108 35
## 353 0 0 0 11290 10642 1.5 4 108 35
## 354 0 1 0 27930 24915 3.3 6 230 18
## 355 0 1 0 54765 47986 4.7 8 325 13
## 356 0 0 0 16695 15156 1.8 4 130 29
## 357 0 0 1 25130 22787 1.8 4 138 26
## 358 0 0 0 20510 18926 1.5 4 110 59
## 359 0 1 0 20290 18553 2.4 4 161 22
## 360 0 1 0 35695 31827 4.7 8 240 14
## 361 0 0 0 23495 21198 3.3 6 230 19
## 362 0 0 0 28800 25690 3.3 6 230 19
## 363 0 0 0 18715 17478 2.0 4 115 24
## 364 0 0 0 19825 18109 1.8 4 180 24
## 365 0 0 0 19005 17427 2.0 4 115 24
## 366 0 0 0 23785 21686 2.8 6 200 21
## 367 0 0 0 21055 19638 1.9 4 100 38
## 368 0 0 0 21055 19638 1.8 4 150 24
## 369 0 0 0 23215 21689 2.0 4 115 24
## 370 0 0 0 24955 22801 1.8 4 170 22
## 371 0 0 0 23955 21898 1.8 4 170 22
## 372 0 0 0 33180 30583 2.8 6 190 19
## 373 0 0 0 40235 36956 4.0 8 270 18
## 374 0 0 0 39235 36052 4.0 8 270 18
## 375 0 1 0 35515 32243 3.2 6 220 15
## 376 0 0 0 42565 40083 2.3 5 242 20
## 377 0 0 0 40565 38203 2.4 5 197 21
## 378 0 0 0 25135 23701 1.9 4 170 22
## 379 0 1 0 31745 29916 2.5 5 208 20
## 380 0 1 0 37560 35382 2.5 5 300 18
## 381 0 0 0 34845 32902 2.3 5 247 20
## 382 0 1 0 37885 35688 2.5 5 194 20
## 383 0 0 0 37730 35542 2.9 6 208 20
## 384 0 0 0 45210 42573 2.9 6 268 19
## 385 0 0 0 26135 24641 1.9 4 170 22
## 386 0 1 0 35145 33112 2.5 5 208 20
## 387 0 1 0 41250 38851 2.9 6 268 15
## Hwy.MPG Weight Wheel.Base Len Width type
## 1 24 3880 115 197 72 type=Small.Sporty..Compact.Large.Sedan
## 2 24 3893 115 197 72 type=Small.Sporty..Compact.Large.Sedan
## 3 23 4451 106 189 77 type=SUV
## 4 24 3153 100 174 71 type=Sports.Car
## 5 31 2778 101 172 68 type=Small.Sporty..Compact.Large.Sedan
## 6 28 3575 108 186 72 type=Small.Sporty..Compact.Large.Sedan
## 7 29 3230 105 183 69 type=Small.Sporty..Compact.Large.Sedan
## 8 31 3252 104 179 70 type=Small.Sporty..Compact.Large.Sedan
## 9 28 3462 104 179 70 type=Small.Sporty..Compact.Large.Sedan
## 10 27 3814 105 180 70 type=Small.Sporty..Compact.Large.Sedan
## 11 25 3627 104 179 70 type=Small.Sporty..Compact.Large.Sedan
## 12 26 3583 104 179 70 type=Small.Sporty..Compact.Large.Sedan
## 13 25 4013 105 180 70 type=Small.Sporty..Compact.Large.Sedan
## 14 30 3638 105 180 70 type=Small.Sporty..Compact.Large.Sedan
## 15 25 3836 109 192 71 type=Small.Sporty..Compact.Large.Sedan
## 16 27 3561 109 192 71 type=Small.Sporty..Compact.Large.Sedan
## 17 25 4035 109 192 71 type=Wagon
## 18 25 3880 109 192 71 type=Small.Sporty..Compact.Large.Sedan
## 19 24 4024 109 193 71 type=Small.Sporty..Compact.Large.Sedan
## 20 24 4399 121 204 75 type=Small.Sporty..Compact.Large.Sedan
## 21 22 4024 109 191 78 type=Sports.Car
## 22 21 3936 104 179 70 type=Wagon
## 23 20 3825 104 179 70 type=Small.Sporty..Compact.Large.Sedan
## 24 28 3131 95 159 73 type=Sports.Car
## 25 28 2921 96 159 73 type=Sports.Car
## 26 29 3351 96 159 73 type=Sports.Car
## 27 29 3197 107 177 69 type=Small.Sporty..Compact.Large.Sedan
## 28 27 3560 107 177 69 type=Small.Sporty..Compact.Large.Sedan
## 29 29 3219 107 176 69 type=Small.Sporty..Compact.Large.Sedan
## 30 27 3461 107 176 69 type=Small.Sporty..Compact.Large.Sedan
## 31 26 3594 107 176 69 type=Wagon
## 32 30 3285 107 176 69 type=Small.Sporty..Compact.Large.Sedan
## 33 28 3616 107 177 69 type=Small.Sporty..Compact.Large.Sedan
## 34 30 3285 107 176 69 type=Small.Sporty..Compact.Large.Sedan
## 35 29 3483 107 176 69 type=Small.Sporty..Compact.Large.Sedan
## 36 28 3428 114 191 73 type=Small.Sporty..Compact.Large.Sedan
## 37 30 3472 114 191 73 type=Small.Sporty..Compact.Large.Sedan
## 38 26 3814 114 191 73 type=Small.Sporty..Compact.Large.Sedan
## 39 26 4376 118 198 75 type=Small.Sporty..Compact.Large.Sedan
## 40 26 4464 123 204 75 type=Small.Sporty..Compact.Large.Sedan
## 41 23 3781 108 177 70 type=Sports.Car
## 42 24 3415 108 177 70 type=Sports.Car
## 43 23 4023 110 180 73 type=SUV
## 44 22 4824 111 184 74 type=SUV
## 45 28 2932 98 161 70 type=Sports.Car
## 46 29 2998 98 161 70 type=Sports.Car
## 47 30 3353 109 195 73 type=Small.Sporty..Compact.Large.Sedan
## 48 29 3567 112 200 74 type=Small.Sporty..Compact.Large.Sedan
## 49 29 3591 112 200 74 type=Small.Sporty..Compact.Large.Sedan
## 50 29 3778 114 207 75 type=Small.Sporty..Compact.Large.Sedan
## 51 28 3909 114 207 75 type=Small.Sporty..Compact.Large.Sedan
## 52 21 4600 113 193 75 type=SUV
## 53 28 3536 109 196 73 type=Small.Sporty..Compact.Large.Sedan
## 54 30 3461 109 196 73 type=Small.Sporty..Compact.Large.Sedan
## 55 26 4024 112 187 74 type=SUV
## 56 25 3694 113 190 71 type=Small.Sporty..Compact.Large.Sedan
## 57 26 3984 115 207 74 type=Small.Sporty..Compact.Large.Sedan
## 58 26 4044 115 207 74 type=Small.Sporty..Compact.Large.Sedan
## 59 18 5367 116 199 79 type=SUV
## 60 26 3992 112 201 75 type=Small.Sporty..Compact.Large.Sedan
## 61 21 4302 116 195 73 type=SUV
## 62 25 3647 106 178 72 type=Sports.Car
## 63 17 4605 111 190 78 type=Minivan
## 64 34 2370 98 167 66 type=Small.Sporty..Compact.Large.Sedan
## 65 34 2348 98 153 66 type=Small.Sporty..Compact.Large.Sedan
## 66 37 2617 104 183 69 type=Small.Sporty..Compact.Large.Sedan
## 67 37 2676 104 183 68 type=Small.Sporty..Compact.Large.Sedan
## 68 37 2617 104 183 69 type=Small.Sporty..Compact.Large.Sedan
## 69 25 3246 105 180 74 type=Sports.Car
## 70 25 3248 105 180 74 type=Sports.Car
## 71 32 3465 111 200 73 type=Small.Sporty..Compact.Large.Sedan
## 72 30 3476 111 200 73 type=Small.Sporty..Compact.Large.Sedan
## 73 28 3606 111 200 73 type=Small.Sporty..Compact.Large.Sedan
## 74 34 3174 106 188 70 type=Small.Sporty..Compact.Large.Sedan
## 75 30 3297 106 188 70 type=Small.Sporty..Compact.Large.Sedan
## 76 32 3315 106 188 70 type=Small.Sporty..Compact.Large.Sedan
## 77 30 3458 112 188 70 type=Wagon
## 78 32 3340 111 198 73 type=Small.Sporty..Compact.Large.Sedan
## 79 28 3434 111 198 73 type=Small.Sporty..Compact.Large.Sedan
## 80 18 4947 130 219 79 type=SUV
## 81 18 5050 116 197 79 type=SUV
## 82 22 2866 98 163 67 type=SUV
## 83 21 4425 113 192 75 type=SUV
## 84 26 3699 112 187 72 type=Minivan
## 85 27 3217 103 169 67 type=Small.Sporty..Compact.Large.Sedan
## 86 27 3581 113 198 74 type=Small.Sporty..Compact.Large.Sedan
## 87 27 3650 113 198 74 type=Small.Sporty..Compact.Large.Sedan
## 88 29 3479 113 208 74 type=Small.Sporty..Compact.Large.Sedan
## 89 27 3548 113 208 74 type=Small.Sporty..Compact.Large.Sedan
## 90 25 3060 95 160 70 type=Sports.Car
## 91 23 4675 116 199 79 type=Wagon
## 92 29 3101 103 169 67 type=Small.Sporty..Compact.Large.Sedan
## 93 29 3105 103 169 67 type=Small.Sporty..Compact.Large.Sedan
## 94 30 3173 108 191 71 type=Small.Sporty..Compact.Large.Sedan
## 95 30 3357 106 194 64 type=Small.Sporty..Compact.Large.Sedan
## 96 28 3448 106 194 69 type=Small.Sporty..Compact.Large.Sedan
## 97 28 3222 108 191 71 type=Small.Sporty..Compact.Large.Sedan
## 98 25 4331 119 201 79 type=Minivan
## 99 26 4068 119 201 79 type=Minivan
## 100 19 5042 116 199 79 type=SUV
## 101 26 3862 113 189 79 type=Minivan
## 102 21 4987 119 201 76 type=SUV
## 103 25 4440 119 201 79 type=Minivan
## 104 27 3487 113 204 75 type=Small.Sporty..Compact.Large.Sedan
## 105 29 3469 113 204 75 type=Small.Sporty..Compact.Large.Sedan
## 106 36 2581 105 174 67 type=Small.Sporty..Compact.Large.Sedan
## 107 36 2626 105 174 67 type=Small.Sporty..Compact.Large.Sedan
## 108 28 3175 108 191 71 type=Small.Sporty..Compact.Large.Sedan
## 109 28 3182 108 191 71 type=Small.Sporty..Compact.Large.Sedan
## 110 25 4057 115 212 78 type=Small.Sporty..Compact.Large.Sedan
## 111 25 4057 115 212 78 type=Small.Sporty..Compact.Large.Sedan
## 112 25 4057 115 212 78 type=Small.Sporty..Compact.Large.Sedan
## 113 23 3346 103 173 70 type=SUV
## 114 19 5000 119 206 79 type=SUV
## 115 20 4463 114 190 72 type=SUV
## 116 36 2606 103 168 67 type=Small.Sporty..Compact.Large.Sedan
## 117 33 2606 103 168 67 type=Small.Sporty..Compact.Large.Sedan
## 118 28 2750 103 168 67 type=Small.Sporty..Compact.Large.Sedan
## 119 33 2702 103 178 67 type=Wagon
## 120 33 2612 103 168 67 type=Small.Sporty..Compact.Large.Sedan
## 121 33 2691 103 168 67 type=Small.Sporty..Compact.Large.Sedan
## 122 23 4275 121 201 77 type=Minivan
## 123 29 3290 101 183 73 type=Sports.Car
## 124 25 3347 101 183 73 type=Sports.Car
## 125 27 3306 109 198 73 type=Small.Sporty..Compact.Large.Sedan
## 126 26 3497 109 198 73 type=Wagon
## 127 26 3313 109 198 73 type=Small.Sporty..Compact.Large.Sedan
## 128 24 3780 107 186 72 type=Sports.Car
## 129 19 4945 129 208 75 type=SUV
## 130 20 4309 111 190 78 type=Minivan
## 131 17 6133 130 219 79 type=SUV
## 132 34 3047 105 188 71 type=Small.Sporty..Compact.Large.Sedan
## 133 30 3294 105 188 71 type=Small.Sporty..Compact.Large.Sedan
## 134 34 2994 105 188 71 type=Small.Sporty..Compact.Large.Sedan
## 135 30 3349 108 190 72 type=Small.Sporty..Compact.Large.Sedan
## 136 38 2432 103 175 67 type=Small.Sporty..Compact.Large.Sedan
## 137 37 2601 103 175 68 type=Small.Sporty..Compact.Large.Sedan
## 138 44 2500 103 175 67 type=Small.Sporty..Compact.Large.Sedan
## 139 51 2732 103 175 68 type=Small.Sporty..Compact.Large.Sedan
## 140 38 2513 103 175 68 type=Small.Sporty..Compact.Large.Sedan
## 141 30 2782 101 166 67 type=Small.Sporty..Compact.Large.Sedan
## 142 25 3258 103 179 70 type=SUV
## 143 24 3468 101 167 72 type=SUV
## 144 66 1850 95 155 67 type=Small.Sporty..Compact.Large.Sedan
## 145 25 4365 118 201 76 type=Minivan
## 146 25 4310 118 201 76 type=Minivan
## 147 22 4387 106 188 77 type=SUV
## 148 25 2835 95 162 69 type=Sports.Car
## 149 12 6400 123 190 81 type=SUV
## 150 33 2255 96 167 66 type=Small.Sporty..Compact.Large.Sedan
## 151 33 2290 96 167 66 type=Small.Sporty..Compact.Large.Sedan
## 152 33 2339 96 167 66 type=Small.Sporty..Compact.Large.Sedan
## 153 34 2635 103 178 68 type=Small.Sporty..Compact.Large.Sedan
## 154 34 2635 103 178 68 type=Small.Sporty..Compact.Large.Sedan
## 155 34 2698 103 178 68 type=Small.Sporty..Compact.Large.Sedan
## 156 26 3549 103 177 73 type=SUV
## 157 27 3217 106 187 72 type=Small.Sporty..Compact.Large.Sedan
## 158 27 3217 106 187 72 type=Small.Sporty..Compact.Large.Sedan
## 159 26 3023 100 173 69 type=Sports.Car
## 160 26 3651 108 192 72 type=Small.Sporty..Compact.Large.Sedan
## 161 26 3651 108 192 72 type=Small.Sporty..Compact.Large.Sedan
## 162 22 4056 112 189 76 type=Wagon
## 163 19 4309 112 189 76 type=Wagon
## 164 26 3336 112 187 69 type=Small.Sporty..Compact.Large.Sedan
## 165 26 3677 112 187 69 type=Small.Sporty..Compact.Large.Sedan
## 166 26 3416 112 182 72 type=Small.Sporty..Compact.Large.Sedan
## 167 26 3306 108 194 70 type=Small.Sporty..Compact.Large.Sedan
## 168 23 3851 110 197 70 type=Small.Sporty..Compact.Large.Sedan
## 169 23 3977 113 200 73 type=Small.Sporty..Compact.Large.Sedan
## 170 20 4967 129 208 76 type=SUV
## 171 21 3836 106 178 70 type=SUV
## 172 26 3777 115 192 72 type=Small.Sporty..Compact.Large.Sedan
## 173 28 3874 115 192 72 type=Small.Sporty..Compact.Large.Sedan
## 174 24 4046 115 192 72 type=Small.Sporty..Compact.Large.Sedan
## 175 28 3803 119 200 73 type=Small.Sporty..Compact.Large.Sedan
## 176 28 3803 119 200 73 type=Small.Sporty..Compact.Large.Sedan
## 177 24 3948 119 200 73 type=Small.Sporty..Compact.Large.Sedan
## 178 26 3980 102 187 71 type=Sports.Car
## 179 26 3779 102 187 71 type=Sports.Car
## 180 23 4042 102 187 71 type=Sports.Car
## 181 23 3865 102 187 71 type=Sports.Car
## 182 26 3428 107 184 70 type=Small.Sporty..Compact.Large.Sedan
## 183 25 3516 107 184 70 type=Small.Sporty..Compact.Large.Sedan
## 184 21 3790 106 181 72 type=SUV
## 185 24 3826 104 174 72 type=SUV
## 186 19 3575 93 150 67 type=SUV
## 187 30 3281 106 186 72 type=Small.Sporty..Compact.Large.Sedan
## 188 27 3279 106 186 72 type=Small.Sporty..Compact.Large.Sedan
## 189 32 2458 95 167 66 type=Small.Sporty..Compact.Large.Sedan
## 190 33 2403 95 167 66 type=Small.Sporty..Compact.Large.Sedan
## 191 33 2447 95 167 66 type=Wagon
## 192 22 4802 115 194 75 type=Minivan
## 193 19 4112 107 180 73 type=SUV
## 194 32 2661 101 178 68 type=Small.Sporty..Compact.Large.Sedan
## 195 32 2686 101 178 68 type=Small.Sporty..Compact.Large.Sedan
## 196 32 2697 101 178 68 type=Small.Sporty..Compact.Large.Sedan
## 197 16 4576 100 185 74 type=SUV
## 198 21 3577 101 175 71 type=SUV
## 199 16 5379 113 195 76 type=SUV
## 200 29 3460 107 191 71 type=Small.Sporty..Compact.Large.Sedan
## 201 25 3649 110 189 71 type=Small.Sporty..Compact.Large.Sedan
## 202 23 3715 110 189 71 type=Small.Sporty..Compact.Large.Sedan
## 203 19 4740 110 188 74 type=SUV
## 204 24 3285 105 177 68 type=Small.Sporty..Compact.Large.Sedan
## 205 25 3255 105 177 68 type=Small.Sporty..Compact.Large.Sedan
## 206 24 3410 105 177 68 type=Wagon
## 207 25 3990 115 197 72 type=Small.Sporty..Compact.Large.Sedan
## 208 17 5590 112 193 76 type=SUV
## 209 24 4065 107 186 73 type=SUV
## 210 23 3840 103 178 72 type=Sports.Car
## 211 18 4834 114 193 76 type=SUV
## 212 26 3681 115 194 73 type=Small.Sporty..Compact.Large.Sedan
## 213 26 3681 115 194 73 type=Small.Sporty..Compact.Large.Sedan
## 214 24 3768 115 194 73 type=Small.Sporty..Compact.Large.Sedan
## 215 24 3768 115 194 73 type=Small.Sporty..Compact.Large.Sedan
## 216 18 5969 119 206 80 type=SUV
## 217 25 4369 118 215 78 type=Small.Sporty..Compact.Large.Sedan
## 218 25 4369 118 215 78 type=Small.Sporty..Compact.Large.Sedan
## 219 25 4474 124 221 78 type=Small.Sporty..Compact.Large.Sedan
## 220 25 3812 112 188 72 type=Minivan
## 221 28 2387 89 156 66 type=Sports.Car
## 222 28 2387 89 156 66 type=Sports.Car
## 223 25 3091 103 173 72 type=SUV
## 224 32 3042 105 187 70 type=Small.Sporty..Compact.Large.Sedan
## 225 30 3250 107 178 68 type=Small.Sporty..Compact.Large.Sedan
## 226 25 3470 107 179 68 type=Wagon
## 227 25 3360 107 178 68 type=Small.Sporty..Compact.Large.Sedan
## 228 25 3360 107 178 68 type=Small.Sporty..Compact.Large.Sedan
## 229 21 3540 107 178 68 type=Small.Sporty..Compact.Large.Sedan
## 230 26 3450 107 178 68 type=Small.Sporty..Compact.Large.Sedan
## 231 26 3430 107 178 68 type=Small.Sporty..Compact.Large.Sedan
## 232 26 3430 107 178 68 type=Small.Sporty..Compact.Large.Sedan
## 233 24 4085 114 196 73 type=Small.Sporty..Compact.Large.Sedan
## 234 19 4473 114 196 73 type=Small.Sporty..Compact.Large.Sedan
## 235 26 3770 107 183 69 type=Small.Sporty..Compact.Large.Sedan
## 236 22 3585 107 183 69 type=Small.Sporty..Compact.Large.Sedan
## 237 27 3966 112 190 71 type=Wagon
## 238 27 3635 112 190 71 type=Small.Sporty..Compact.Large.Sedan
## 239 24 4230 112 190 71 type=Wagon
## 240 20 3815 112 190 71 type=Small.Sporty..Compact.Large.Sedan
## 241 14 5423 112 186 71 type=SUV
## 242 17 4874 111 183 72 type=SUV
## 243 26 4160 122 203 73 type=Small.Sporty..Compact.Large.Sedan
## 244 24 4390 122 203 73 type=Small.Sporty..Compact.Large.Sedan
## 245 23 4065 101 179 72 type=Sports.Car
## 246 21 4235 101 179 72 type=Sports.Car
## 247 19 4429 101 179 72 type=Sports.Car
## 248 29 3055 95 158 68 type=Sports.Car
## 249 22 3220 95 158 68 type=Sports.Car
## 250 25 4052 115 212 78 type=Small.Sporty..Compact.Large.Sedan
## 251 25 4052 115 212 78 type=Small.Sporty..Compact.Large.Sedan
## 252 25 4052 115 212 78 type=Small.Sporty..Compact.Large.Sedan
## 253 23 4195 115 212 78 type=Small.Sporty..Compact.Large.Sedan
## 254 23 4340 121 202 77 type=Minivan
## 255 21 4374 114 190 72 type=SUV
## 256 26 3488 109 198 73 type=Wagon
## 257 27 3308 109 200 73 type=Small.Sporty..Compact.Large.Sedan
## 258 26 3315 109 200 73 type=Small.Sporty..Compact.Large.Sedan
## 259 37 2524 97 143 67 type=Small.Sporty..Compact.Large.Sedan
## 260 34 2678 97 144 67 type=Small.Sporty..Compact.Large.Sedan
## 261 25 3549 107 194 70 type=Small.Sporty..Compact.Large.Sedan
## 262 28 3241 101 177 69 type=Sports.Car
## 263 28 3296 101 177 69 type=Sports.Car
## 264 21 4134 109 190 74 type=SUV
## 265 26 3649 108 191 72 type=Small.Sporty..Compact.Large.Sedan
## 266 26 3263 103 179 70 type=Sports.Car
## 267 19 4718 110 190 75 type=SUV
## 268 27 3240 103 179 69 type=SUV
## 269 26 3188 104 169 72 type=Sports.Car
## 270 26 3428 104 169 72 type=Sports.Car
## 271 26 3039 110 192 70 type=Small.Sporty..Compact.Large.Sedan
## 272 26 3197 110 192 70 type=Small.Sporty..Compact.Large.Sedan
## 273 28 3473 111 194 72 type=Small.Sporty..Compact.Large.Sedan
## 274 28 3476 111 194 72 type=Small.Sporty..Compact.Large.Sedan
## 275 25 3801 111 188 74 type=Wagon
## 276 19 5013 123 207 79 type=SUV
## 277 21 3871 106 183 72 type=SUV
## 278 26 4012 124 204 78 type=Minivan
## 279 25 4175 124 204 78 type=Minivan
## 280 35 2513 100 178 67 type=Small.Sporty..Compact.Large.Sedan
## 281 35 2581 100 178 67 type=Small.Sporty..Compact.Large.Sedan
## 282 28 2761 100 178 67 type=Small.Sporty..Compact.Large.Sedan
## 283 20 3760 104 178 70 type=SUV
## 284 29 3085 107 187 70 type=Small.Sporty..Compact.Large.Sedan
## 285 32 2946 107 187 70 type=Small.Sporty..Compact.Large.Sedan
## 286 26 3948 120 201 72 type=Minivan
## 287 26 3779 108 182 74 type=SUV
## 288 29 3118 107 186 70 type=Small.Sporty..Compact.Large.Sedan
## 289 30 3477 111 198 74 type=Small.Sporty..Compact.Large.Sedan
## 290 30 3484 111 198 74 type=Small.Sporty..Compact.Large.Sedan
## 291 26 3803 112 187 72 type=Minivan
## 292 24 4431 121 201 72 type=Minivan
## 293 33 2771 104 182 68 type=Small.Sporty..Compact.Large.Sedan
## 294 33 2771 104 182 68 type=Small.Sporty..Compact.Large.Sedan
## 295 36 2701 102 172 70 type=Wagon
## 296 24 3240 93 175 72 type=Sports.Car
## 297 26 3135 93 175 70 type=Sports.Car
## 298 24 3131 93 175 72 type=Sports.Car
## 299 26 3119 93 175 70 type=Sports.Car
## 300 29 2811 95 170 70 type=Sports.Car
## 301 26 2911 95 170 70 type=Sports.Car
## 302 18 4950 112 188 76 type=SUV
## 303 28 3175 105 183 69 type=Small.Sporty..Compact.Large.Sedan
## 304 30 3700 105 182 69 type=Small.Sporty..Compact.Large.Sedan
## 305 29 3480 105 182 69 type=Small.Sporty..Compact.Large.Sedan
## 306 28 3175 105 183 69 type=Small.Sporty..Compact.Large.Sedan
## 307 29 3620 106 190 71 type=Wagon
## 308 29 3470 106 190 71 type=Small.Sporty..Compact.Large.Sedan
## 309 29 3470 106 190 71 type=Small.Sporty..Compact.Large.Sedan
## 310 35 2692 103 185 67 type=Small.Sporty..Compact.Large.Sedan
## 311 34 3109 107 190 69 type=Wagon
## 312 28 3197 107 190 69 type=Small.Sporty..Compact.Large.Sedan
## 313 35 2692 103 185 67 type=Small.Sporty..Compact.Large.Sedan
## 314 35 2751 103 185 68 type=Small.Sporty..Compact.Large.Sedan
## 315 35 2692 103 185 67 type=Small.Sporty..Compact.Large.Sedan
## 316 35 2751 103 185 68 type=Small.Sporty..Compact.Large.Sedan
## 317 26 3381 107 181 72 type=SUV
## 318 38 2340 93 154 67 type=Small.Sporty..Compact.Large.Sedan
## 319 35 2425 98 155 67 type=Wagon
## 320 28 3090 99 175 68 type=Wagon
## 321 28 2965 99 174 69 type=Small.Sporty..Compact.Large.Sedan
## 322 27 3085 99 174 69 type=Sports.Car
## 323 24 3263 100 174 69 type=Sports.Car
## 324 28 3395 104 184 69 type=Small.Sporty..Compact.Large.Sedan
## 325 28 3285 104 184 69 type=Small.Sporty..Compact.Large.Sedan
## 326 28 3430 104 187 69 type=Wagon
## 327 26 3610 104 184 69 type=Small.Sporty..Compact.Large.Sedan
## 328 26 3630 104 184 69 type=Small.Sporty..Compact.Large.Sedan
## 329 27 3495 104 184 69 type=Small.Sporty..Compact.Large.Sedan
## 330 31 2676 98 171 68 type=Small.Sporty..Compact.Large.Sedan
## 331 31 2676 98 171 68 type=Small.Sporty..Compact.Large.Sedan
## 332 29 2932 98 167 68 type=Wagon
## 333 30 2756 102 177 68 type=Small.Sporty..Compact.Large.Sedan
## 334 31 2701 102 177 68 type=Small.Sporty..Compact.Large.Sedan
## 335 27 3380 106 188 72 type=Small.Sporty..Compact.Large.Sedan
## 336 22 3020 98 163 67 type=SUV
## 337 22 3682 110 187 70 type=SUV
## 338 21 4035 110 189 74 type=SUV
## 339 29 3417 107 192 72 type=Small.Sporty..Compact.Large.Sedan
## 340 29 3439 107 192 72 type=Small.Sporty..Compact.Large.Sedan
## 341 33 3086 107 189 71 type=Small.Sporty..Compact.Large.Sedan
## 342 29 3296 107 189 71 type=Small.Sporty..Compact.Large.Sedan
## 343 33 3175 107 193 72 type=Small.Sporty..Compact.Large.Sedan
## 344 29 3417 107 193 72 type=Small.Sporty..Compact.Large.Sedan
## 345 29 3439 107 193 72 type=Small.Sporty..Compact.Large.Sedan
## 346 29 3362 107 189 71 type=Small.Sporty..Compact.Large.Sedan
## 347 33 2500 102 171 68 type=Sports.Car
## 348 40 2502 102 178 67 type=Small.Sporty..Compact.Large.Sedan
## 349 40 2524 102 178 67 type=Small.Sporty..Compact.Large.Sedan
## 350 40 2524 102 178 67 type=Small.Sporty..Compact.Large.Sedan
## 351 39 2085 93 163 65 type=Small.Sporty..Compact.Large.Sedan
## 352 43 2035 93 163 65 type=Small.Sporty..Compact.Large.Sedan
## 353 43 2055 93 163 65 type=Small.Sporty..Compact.Large.Sedan
## 354 24 3935 107 185 72 type=SUV
## 355 17 5390 112 193 76 type=SUV
## 356 36 2679 102 171 70 type=Wagon
## 357 32 2195 97 153 67 type=Sports.Car
## 358 51 2890 106 175 68 type=Small.Sporty..Compact.Large.Sedan
## 359 27 3119 98 167 68 type=SUV
## 360 17 5270 118 204 78 type=SUV
## 361 27 4120 119 200 77 type=Minivan
## 362 27 4165 119 200 77 type=Minivan
## 363 31 2897 99 165 68 type=Small.Sporty..Compact.Large.Sedan
## 364 31 2934 99 168 68 type=Small.Sporty..Compact.Large.Sedan
## 365 30 3034 99 174 68 type=Wagon
## 366 30 3179 99 172 68 type=Small.Sporty..Compact.Large.Sedan
## 367 46 3003 99 172 68 type=Small.Sporty..Compact.Large.Sedan
## 368 31 2820 99 161 68 type=Small.Sporty..Compact.Large.Sedan
## 369 30 3082 99 161 68 type=Small.Sporty..Compact.Large.Sedan
## 370 31 3338 106 184 69 type=Wagon
## 371 31 3241 106 185 69 type=Small.Sporty..Compact.Large.Sedan
## 372 26 3721 106 185 69 type=Small.Sporty..Compact.Large.Sedan
## 373 25 4067 106 184 69 type=Wagon
## 374 25 3953 106 185 69 type=Small.Sporty..Compact.Large.Sedan
## 375 20 5086 112 187 76 type=SUV
## 376 26 3450 105 186 72 type=Small.Sporty..Compact.Large.Sedan
## 377 28 3450 105 186 72 type=Small.Sporty..Compact.Large.Sedan
## 378 29 2767 101 178 68 type=Small.Sporty..Compact.Large.Sedan
## 379 27 3903 107 180 71 type=Small.Sporty..Compact.Large.Sedan
## 380 25 3571 107 181 71 type=Small.Sporty..Compact.Large.Sedan
## 381 28 3766 107 180 71 type=Small.Sporty..Compact.Large.Sedan
## 382 27 3691 110 190 72 type=Small.Sporty..Compact.Large.Sedan
## 383 28 3576 110 190 72 type=Small.Sporty..Compact.Large.Sedan
## 384 26 3653 110 190 72 type=Small.Sporty..Compact.Large.Sedan
## 385 29 2822 101 180 68 type=Wagon
## 386 27 3823 109 186 73 type=Wagon
## 387 20 4638 113 189 75 type=SUV
## wheeltype
## 1 wheeltype=AWD
## 2 wheeltype=RWD
## 3 wheeltype=AWD
## 4 wheeltype=RWD
## 5 wheeltype=RWD
## 6 wheeltype=AWD
## 7 wheeltype=RWD
## 8 wheeltype=RWD
## 9 wheeltype=RWD
## 10 wheeltype=AWD
## 11 wheeltype=AWD
## 12 wheeltype=AWD
## 13 wheeltype=AWD
## 14 wheeltype=AWD
## 15 wheeltype=AWD
## 16 wheeltype=RWD
## 17 wheeltype=AWD
## 18 wheeltype=AWD
## 19 wheeltype=AWD
## 20 wheeltype=AWD
## 21 wheeltype=RWD
## 22 wheeltype=AWD
## 23 wheeltype=AWD
## 24 wheeltype=AWD
## 25 wheeltype=AWD
## 26 wheeltype=AWD
## 27 wheeltype=RWD
## 28 wheeltype=RWD
## 29 wheeltype=RWD
## 30 wheeltype=AWD
## 31 wheeltype=AWD
## 32 wheeltype=RWD
## 33 wheeltype=RWD
## 34 wheeltype=RWD
## 35 wheeltype=AWD
## 36 wheeltype=RWD
## 37 wheeltype=RWD
## 38 wheeltype=RWD
## 39 wheeltype=RWD
## 40 wheeltype=RWD
## 41 wheeltype=RWD
## 42 wheeltype=RWD
## 43 wheeltype=AWD
## 44 wheeltype=AWD
## 45 wheeltype=RWD
## 46 wheeltype=RWD
## 47 wheeltype=AWD
## 48 wheeltype=RWD
## 49 wheeltype=RWD
## 50 wheeltype=AWD
## 51 wheeltype=RWD
## 52 wheeltype=AWD
## 53 wheeltype=RWD
## 54 wheeltype=RWD
## 55 wheeltype=RWD
## 56 wheeltype=RWD
## 57 wheeltype=AWD
## 58 wheeltype=AWD
## 59 wheeltype=RWD
## 60 wheeltype=AWD
## 61 wheeltype=AWD
## 62 wheeltype=RWD
## 63 wheeltype=AWD
## 64 wheeltype=AWD
## 65 wheeltype=RWD
## 66 wheeltype=AWD
## 67 wheeltype=AWD
## 68 wheeltype=RWD
## 69 wheeltype=RWD
## 70 wheeltype=RWD
## 71 wheeltype=AWD
## 72 wheeltype=RWD
## 73 wheeltype=RWD
## 74 wheeltype=RWD
## 75 wheeltype=RWD
## 76 wheeltype=AWD
## 77 wheeltype=AWD
## 78 wheeltype=AWD
## 79 wheeltype=RWD
## 80 wheeltype=RWD
## 81 wheeltype=AWD
## 82 wheeltype=AWD
## 83 wheeltype=RWD
## 84 wheeltype=RWD
## 85 wheeltype=AWD
## 86 wheeltype=AWD
## 87 wheeltype=AWD
## 88 wheeltype=RWD
## 89 wheeltype=AWD
## 90 wheeltype=RWD
## 91 wheeltype=RWD
## 92 wheeltype=AWD
## 93 wheeltype=RWD
## 94 wheeltype=RWD
## 95 wheeltype=AWD
## 96 wheeltype=RWD
## 97 wheeltype=AWD
## 98 wheeltype=AWD
## 99 wheeltype=AWD
## 100 wheeltype=RWD
## 101 wheeltype=RWD
## 102 wheeltype=AWD
## 103 wheeltype=AWD
## 104 wheeltype=RWD
## 105 wheeltype=RWD
## 106 wheeltype=AWD
## 107 wheeltype=RWD
## 108 wheeltype=RWD
## 109 wheeltype=AWD
## 110 wheeltype=RWD
## 111 wheeltype=RWD
## 112 wheeltype=RWD
## 113 wheeltype=AWD
## 114 wheeltype=RWD
## 115 wheeltype=AWD
## 116 wheeltype=AWD
## 117 wheeltype=RWD
## 118 wheeltype=AWD
## 119 wheeltype=AWD
## 120 wheeltype=RWD
## 121 wheeltype=RWD
## 122 wheeltype=RWD
## 123 wheeltype=RWD
## 124 wheeltype=RWD
## 125 wheeltype=AWD
## 126 wheeltype=AWD
## 127 wheeltype=AWD
## 128 wheeltype=RWD
## 129 wheeltype=RWD
## 130 wheeltype=RWD
## 131 wheeltype=AWD
## 132 wheeltype=RWD
## 133 wheeltype=RWD
## 134 wheeltype=RWD
## 135 wheeltype=AWD
## 136 wheeltype=RWD
## 137 wheeltype=RWD
## 138 wheeltype=RWD
## 139 wheeltype=AWD
## 140 wheeltype=RWD
## 141 wheeltype=RWD
## 142 wheeltype=AWD
## 143 wheeltype=AWD
## 144 wheeltype=RWD
## 145 wheeltype=RWD
## 146 wheeltype=AWD
## 147 wheeltype=AWD
## 148 wheeltype=RWD
## 149 wheeltype=AWD
## 150 wheeltype=AWD
## 151 wheeltype=RWD
## 152 wheeltype=RWD
## 153 wheeltype=RWD
## 154 wheeltype=RWD
## 155 wheeltype=RWD
## 156 wheeltype=AWD
## 157 wheeltype=AWD
## 158 wheeltype=AWD
## 159 wheeltype=AWD
## 160 wheeltype=AWD
## 161 wheeltype=RWD
## 162 wheeltype=RWD
## 163 wheeltype=AWD
## 164 wheeltype=RWD
## 165 wheeltype=AWD
## 166 wheeltype=RWD
## 167 wheeltype=AWD
## 168 wheeltype=RWD
## 169 wheeltype=RWD
## 170 wheeltype=AWD
## 171 wheeltype=RWD
## 172 wheeltype=RWD
## 173 wheeltype=RWD
## 174 wheeltype=RWD
## 175 wheeltype=RWD
## 176 wheeltype=RWD
## 177 wheeltype=RWD
## 178 wheeltype=RWD
## 179 wheeltype=RWD
## 180 wheeltype=RWD
## 181 wheeltype=RWD
## 182 wheeltype=AWD
## 183 wheeltype=AWD
## 184 wheeltype=RWD
## 185 wheeltype=AWD
## 186 wheeltype=AWD
## 187 wheeltype=AWD
## 188 wheeltype=RWD
## 189 wheeltype=RWD
## 190 wheeltype=AWD
## 191 wheeltype=AWD
## 192 wheeltype=AWD
## 193 wheeltype=AWD
## 194 wheeltype=RWD
## 195 wheeltype=RWD
## 196 wheeltype=AWD
## 197 wheeltype=AWD
## 198 wheeltype=AWD
## 199 wheeltype=AWD
## 200 wheeltype=AWD
## 201 wheeltype=RWD
## 202 wheeltype=RWD
## 203 wheeltype=AWD
## 204 wheeltype=RWD
## 205 wheeltype=RWD
## 206 wheeltype=RWD
## 207 wheeltype=RWD
## 208 wheeltype=AWD
## 209 wheeltype=AWD
## 210 wheeltype=RWD
## 211 wheeltype=RWD
## 212 wheeltype=RWD
## 213 wheeltype=RWD
## 214 wheeltype=RWD
## 215 wheeltype=RWD
## 216 wheeltype=AWD
## 217 wheeltype=RWD
## 218 wheeltype=RWD
## 219 wheeltype=RWD
## 220 wheeltype=AWD
## 221 wheeltype=RWD
## 222 wheeltype=RWD
## 223 wheeltype=AWD
## 224 wheeltype=RWD
## 225 wheeltype=RWD
## 226 wheeltype=RWD
## 227 wheeltype=RWD
## 228 wheeltype=AWD
## 229 wheeltype=RWD
## 230 wheeltype=RWD
## 231 wheeltype=RWD
## 232 wheeltype=RWD
## 233 wheeltype=RWD
## 234 wheeltype=RWD
## 235 wheeltype=RWD
## 236 wheeltype=RWD
## 237 wheeltype=RWD
## 238 wheeltype=RWD
## 239 wheeltype=AWD
## 240 wheeltype=RWD
## 241 wheeltype=AWD
## 242 wheeltype=AWD
## 243 wheeltype=RWD
## 244 wheeltype=AWD
## 245 wheeltype=RWD
## 246 wheeltype=RWD
## 247 wheeltype=RWD
## 248 wheeltype=RWD
## 249 wheeltype=RWD
## 250 wheeltype=RWD
## 251 wheeltype=RWD
## 252 wheeltype=RWD
## 253 wheeltype=RWD
## 254 wheeltype=RWD
## 255 wheeltype=AWD
## 256 wheeltype=RWD
## 257 wheeltype=AWD
## 258 wheeltype=RWD
## 259 wheeltype=AWD
## 260 wheeltype=RWD
## 261 wheeltype=RWD
## 262 wheeltype=RWD
## 263 wheeltype=RWD
## 264 wheeltype=AWD
## 265 wheeltype=AWD
## 266 wheeltype=AWD
## 267 wheeltype=AWD
## 268 wheeltype=AWD
## 269 wheeltype=RWD
## 270 wheeltype=RWD
## 271 wheeltype=RWD
## 272 wheeltype=RWD
## 273 wheeltype=AWD
## 274 wheeltype=RWD
## 275 wheeltype=RWD
## 276 wheeltype=AWD
## 277 wheeltype=AWD
## 278 wheeltype=RWD
## 279 wheeltype=RWD
## 280 wheeltype=AWD
## 281 wheeltype=AWD
## 282 wheeltype=RWD
## 283 wheeltype=AWD
## 284 wheeltype=AWD
## 285 wheeltype=RWD
## 286 wheeltype=RWD
## 287 wheeltype=AWD
## 288 wheeltype=AWD
## 289 wheeltype=RWD
## 290 wheeltype=AWD
## 291 wheeltype=AWD
## 292 wheeltype=AWD
## 293 wheeltype=AWD
## 294 wheeltype=RWD
## 295 wheeltype=RWD
## 296 wheeltype=AWD
## 297 wheeltype=RWD
## 298 wheeltype=RWD
## 299 wheeltype=RWD
## 300 wheeltype=RWD
## 301 wheeltype=RWD
## 302 wheeltype=AWD
## 303 wheeltype=AWD
## 304 wheeltype=AWD
## 305 wheeltype=RWD
## 306 wheeltype=AWD
## 307 wheeltype=AWD
## 308 wheeltype=RWD
## 309 wheeltype=AWD
## 310 wheeltype=AWD
## 311 wheeltype=RWD
## 312 wheeltype=RWD
## 313 wheeltype=RWD
## 314 wheeltype=AWD
## 315 wheeltype=RWD
## 316 wheeltype=AWD
## 317 wheeltype=AWD
## 318 wheeltype=AWD
## 319 wheeltype=RWD
## 320 wheeltype=AWD
## 321 wheeltype=AWD
## 322 wheeltype=AWD
## 323 wheeltype=AWD
## 324 wheeltype=AWD
## 325 wheeltype=AWD
## 326 wheeltype=AWD
## 327 wheeltype=AWD
## 328 wheeltype=AWD
## 329 wheeltype=AWD
## 330 wheeltype=AWD
## 331 wheeltype=RWD
## 332 wheeltype=AWD
## 333 wheeltype=RWD
## 334 wheeltype=AWD
## 335 wheeltype=AWD
## 336 wheeltype=AWD
## 337 wheeltype=RWD
## 338 wheeltype=RWD
## 339 wheeltype=RWD
## 340 wheeltype=AWD
## 341 wheeltype=RWD
## 342 wheeltype=AWD
## 343 wheeltype=RWD
## 344 wheeltype=RWD
## 345 wheeltype=RWD
## 346 wheeltype=RWD
## 347 wheeltype=AWD
## 348 wheeltype=AWD
## 349 wheeltype=AWD
## 350 wheeltype=RWD
## 351 wheeltype=RWD
## 352 wheeltype=AWD
## 353 wheeltype=AWD
## 354 wheeltype=AWD
## 355 wheeltype=AWD
## 356 wheeltype=AWD
## 357 wheeltype=RWD
## 358 wheeltype=RWD
## 359 wheeltype=AWD
## 360 wheeltype=AWD
## 361 wheeltype=RWD
## 362 wheeltype=RWD
## 363 wheeltype=AWD
## 364 wheeltype=AWD
## 365 wheeltype=AWD
## 366 wheeltype=AWD
## 367 wheeltype=RWD
## 368 wheeltype=AWD
## 369 wheeltype=RWD
## 370 wheeltype=RWD
## 371 wheeltype=AWD
## 372 wheeltype=AWD
## 373 wheeltype=AWD
## 374 wheeltype=RWD
## 375 wheeltype=AWD
## 376 wheeltype=AWD
## 377 wheeltype=AWD
## 378 wheeltype=RWD
## 379 wheeltype=AWD
## 380 wheeltype=AWD
## 381 wheeltype=AWD
## 382 wheeltype=AWD
## 383 wheeltype=AWD
## 384 wheeltype=AWD
## 385 wheeltype=RWD
## 386 wheeltype=AWD
## 387 wheeltype=AWD
# PCA of the cars data with the first 100 rows held out as supplementary
# individuals.  NOTE(review): ind.sup rows are projected onto the axes but
# excluded from the fit, so the original comment ("run a PCA on the first
# 100 car categories") described the opposite of what this call does.
# Columns 1:8 are supplementary quantitative variables and columns 20:21
# supplementary qualitative variables; graph = FALSE suppresses plotting
# (FALSE spelled out rather than the reassignable shorthand F).
pca_output_hundred <- FactoMineR::PCA(
  cars,
  quanti.sup = 1:8,
  quali.sup = 20:21,
  ind.sup = 1:100,
  graph = FALSE
)
# Trace variable contributions in pca_output_hundred
pca_output_hundred$var$contrib
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## Retail.Price 6.699758 2.301732e+01 5.6092132 6.73645561 6.685871e-01
## Dealer.Cost 6.636507 2.316262e+01 5.6749619 7.12398875 5.247519e-01
## Engine.Size..l. 12.207533 3.398443e-05 0.1690355 25.15549588 1.161249e-01
## Cyl 11.092661 5.960251e-01 0.6920445 44.70555192 3.208953e+00
## HP 10.006863 8.872771e+00 0.3887040 0.05144773 3.440815e+00
## City.MPG 9.535081 2.019316e-02 30.3660045 4.74002551 1.019129e+01
## Hwy.MPG 9.456488 8.985550e-03 35.8045505 1.36652382 1.798497e-04
## Weight 11.373558 2.643370e+00 0.7304973 0.10317032 2.668755e+01
## Wheel.Base 7.031113 1.701242e+01 7.7390269 4.84277075 2.178306e+00
## Len 7.107357 1.448978e+01 11.4900264 4.36687777 2.383013e+01
## Width 8.853081 1.017648e+01 1.3359353 0.80769192 2.915331e+01
# Run a PCA using the 10 non-binary numeric variables
# NOTE(review): cars[, 9:19] actually selects 11 columns -- the summary
# below reports a total inertia of 11 -- so confirm whether the "10" in
# this comment or the column range is what was intended.
# dudi.pca() centers and scales by default; scannf = FALSE suppresses the
# interactive screeplot prompt and nf = 4 keeps four axes.
cars_pca <- ade4::dudi.pca(cars[,9:19], scannf = FALSE, nf = 4)
# Explore the summary of cars_pca
summary(cars_pca)
## Class: pca dudi
## Call: ade4::dudi.pca(df = cars[, 9:19], scannf = FALSE, nf = 4)
##
## Total inertia: 11
##
## Eigenvalues:
## Ax1 Ax2 Ax3 Ax4 Ax5
## 7.1046 1.8839 0.8497 0.3570 0.2754
##
## Projected inertia (%):
## Ax1 Ax2 Ax3 Ax4 Ax5
## 64.588 17.127 7.725 3.246 2.504
##
## Cumulative projected inertia (%):
## Ax1 Ax1:2 Ax1:3 Ax1:4 Ax1:5
## 64.59 81.71 89.44 92.68 95.19
##
## (Only 5 dimensions (out of 11) are shown)
# Explore the summary of pca_output_ten_v
# (pca_output_ten_v is created earlier in the document; the Call recorded
# in the output below shows it was FactoMineR::PCA(cars[, 9:19], ncp = 4,
# graph = F) -- i.e. the same 11 columns as cars_pca above.)
summary(pca_output_ten_v)
##
## Call:
## FactoMineR::PCA(X = cars[, 9:19], ncp = 4, graph = F)
##
##
## Eigenvalues
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5 Dim.6 Dim.7
## Variance 7.105 1.884 0.850 0.357 0.275 0.198 0.141
## % of var. 64.588 17.127 7.725 3.246 2.504 1.799 1.277
## Cumulative % of var. 64.588 81.714 89.439 92.685 95.189 96.988 98.266
## Dim.8 Dim.9 Dim.10 Dim.11
## Variance 0.087 0.066 0.037 0.001
## % of var. 0.788 0.604 0.336 0.007
## Cumulative % of var. 99.053 99.657 99.993 100.000
##
## Individuals (the 10 first)
## Dist Dim.1 ctr cos2 Dim.2 ctr cos2 Dim.3
## 1 | 1.887 | 1.567 0.089 0.690 | 0.447 0.027 0.056 | 0.287
## 2 | 1.961 | 1.636 0.097 0.696 | 0.340 0.016 0.030 | 0.346
## 3 | 2.535 | 1.907 0.132 0.565 | 0.411 0.023 0.026 | -0.553
## 4 | 4.457 | 1.590 0.092 0.127 | -3.863 2.046 0.751 | 0.357
## 5 | 2.810 | -2.655 0.256 0.892 | -0.654 0.059 0.054 | -0.173
## 6 | 0.865 | 0.441 0.007 0.260 | -0.081 0.001 0.009 | 0.190
## 7 | 1.766 | -1.538 0.086 0.759 | -0.028 0.000 0.000 | -0.098
## 8 | 2.248 | -2.036 0.151 0.820 | 0.062 0.001 0.001 | -0.095
## 9 | 0.803 | -0.410 0.006 0.261 | -0.471 0.030 0.344 | -0.278
## 10 | 1.006 | 0.151 0.001 0.023 | -0.802 0.088 0.636 | -0.103
## ctr cos2
## 1 0.025 0.023 |
## 2 0.036 0.031 |
## 3 0.093 0.048 |
## 4 0.039 0.006 |
## 5 0.009 0.004 |
## 6 0.011 0.048 |
## 7 0.003 0.003 |
## 8 0.003 0.002 |
## 9 0.024 0.120 |
## 10 0.003 0.011 |
##
## Variables (the 10 first)
## Dim.1 ctr cos2 Dim.2 ctr cos2 Dim.3 ctr
## Retail.Price | 0.703 6.956 0.494 | -0.643 21.950 0.414 | 0.235 6.501
## Dealer.Cost | 0.699 6.881 0.489 | -0.645 22.104 0.416 | 0.237 6.618
## Engine.Size..l. | 0.925 12.046 0.856 | 0.021 0.024 0.000 | 0.044 0.223
## Cyl | 0.891 11.168 0.793 | -0.107 0.609 0.011 | 0.075 0.663
## HP | 0.849 10.151 0.721 | -0.401 8.539 0.161 | 0.070 0.583
## City.MPG | -0.828 9.640 0.685 | 0.005 0.001 0.000 | 0.493 28.629
## Hwy.MPG | -0.817 9.400 0.668 | 0.015 0.012 0.000 | 0.552 35.880
## Weight | 0.896 11.312 0.804 | 0.230 2.804 0.053 | -0.103 1.259
## Wheel.Base | 0.710 7.087 0.503 | 0.574 17.487 0.329 | 0.244 6.994
## Len | 0.684 6.594 0.468 | 0.561 16.680 0.314 | 0.318 11.882
## cos2
## Retail.Price 0.055 |
## Dealer.Cost 0.056 |
## Engine.Size..l. 0.002 |
## Cyl 0.006 |
## HP 0.005 |
## City.MPG 0.243 |
## Hwy.MPG 0.305 |
## Weight 0.011 |
## Wheel.Base 0.059 |
## Len 0.101 |
# Create a factor map for the variables, keeping only those whose quality
# of representation (cos2) exceeds 0.7; repel = TRUE avoids label overlap.
factoextra::fviz_pca_var(pca_output_all, select.var = list(cos2 = 0.7), repel = TRUE)
# Modify the code to create a factor map for the individuals (same cos2
# threshold, applied via select.ind instead of select.var).
factoextra::fviz_pca_ind(pca_output_all, select.ind = list(cos2 = 0.7), repel = TRUE)
# Create a barplot for the variables with the highest cos2 in the 1st PC
factoextra::fviz_cos2(pca_output_all, choice = "var", axes = 1, top = 10)
# Create a barplot for the variables with the highest cos2 in the 2nd PC
factoextra::fviz_cos2(pca_output_all, choice = "var", axes = 2, top = 10)
# Create a factor map for the top 5 variables with the highest contributions
# (a contrib value >= 1 selects the top-n items rather than thresholding).
factoextra::fviz_pca_var(pca_output_all, select.var = list(contrib = 5), repel = TRUE)
# Create a factor map for the top 5 individuals with the highest contributions
factoextra::fviz_pca_ind(pca_output_all, select.ind = list(contrib = 5), repel = TRUE)
# Create a barplot for the variables with the highest contributions to the 1st PC
factoextra::fviz_contrib(pca_output_all, choice = "var", axes = 1, top = 5)
# Create a barplot for the variables with the highest contributions to the 2nd PC
factoextra::fviz_contrib(pca_output_all, choice = "var", axes = 2, top = 5)
# Create a biplot of individuals and variables.
# NOTE(review): the original comment mentioned suppressing labels with the
# geom argument, but no geom argument is passed here, so defaults apply.
factoextra::fviz_pca_biplot(pca_output_all)
# Color the individuals by wheeltype and add a concentration ellipse per group.
factoextra::fviz_pca_ind(pca_output_all, habillage = cars$wheeltype, addEllipses = TRUE)
# Create the biplot with ellipsoids; alpha.var = "cos2" fades variable
# arrows in proportion to their quality of representation.
factoextra::fviz_pca_biplot(pca_output_all, habillage=cars$wheeltype, addEllipses=TRUE, alpha.var="cos2")
Chapter 2 - Advanced PCA and Non-Negative Matrix Factorization (NMF)
Determining the right number of PCs:
Performing PCA on datasets with missing values:
NMF and Topic Detection with NMF::nmf():
Example code includes:
# Load the raw dataset, then keep only rows with no missing values
# (this PCA is run on complete cases).
data(airquality, package="datasets")
airquality <- airquality[complete.cases(airquality), ]
# Conduct a PCA on the airquality dataset (default graph = TRUE also plots).
pca_air <- FactoMineR::PCA(airquality)
# Apply the Kaiser-Guttman rule: retain components whose eigenvalue
# exceeds 1.  From the eigenvalue table below, Dim.1 (2.469) and
# Dim.2 (1.113) qualify; Dim.3 (0.998) just misses the cutoff.
summary(pca_air, ncp = 4)
##
## Call:
## FactoMineR::PCA(X = airquality)
##
##
## Eigenvalues
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5 Dim.6
## Variance 2.469 1.113 0.998 0.768 0.425 0.227
## % of var. 41.147 18.552 16.640 12.804 7.078 3.778
## Cumulative % of var. 41.147 59.699 76.339 89.144 96.222 100.000
##
## Individuals (the 10 first)
## Dist Dim.1 ctr cos2 Dim.2 ctr cos2 Dim.3 ctr
## 1 | 2.659 | -0.566 0.117 0.045 | -1.463 1.732 0.303 | -1.370 1.693
## 2 | 2.474 | -0.664 0.161 0.072 | -0.859 0.598 0.121 | -1.365 1.682
## 3 | 2.494 | -1.488 0.808 0.356 | -1.029 0.857 0.170 | -1.424 1.830
## 4 | 3.110 | -1.454 0.772 0.219 | -2.327 4.383 0.560 | -1.092 1.077
## 7 | 2.688 | -0.866 0.273 0.104 | -2.101 3.572 0.611 | -0.710 0.455
## 8 | 3.101 | -2.607 2.480 0.707 | -0.780 0.493 0.063 | -0.812 0.595
## 9 | 4.323 | -3.780 5.215 0.765 | -0.236 0.045 0.003 | -0.919 0.761
## 12 | 2.142 | -1.067 0.415 0.248 | -1.640 2.178 0.586 | -0.248 0.055
## 13 | 2.493 | -1.164 0.494 0.218 | -1.867 2.822 0.561 | -0.111 0.011
## 14 | 2.271 | -1.283 0.601 0.319 | -1.757 2.498 0.598 | -0.034 0.001
## cos2 Dim.4 ctr cos2
## 1 0.265 | -1.493 2.615 0.316 |
## 2 0.305 | -1.752 3.600 0.502 |
## 3 0.326 | -0.647 0.491 0.067 |
## 4 0.123 | 0.045 0.002 0.000 |
## 7 0.070 | -0.446 0.233 0.028 |
## 8 0.069 | -0.999 1.171 0.104 |
## 9 0.045 | -0.429 0.216 0.010 |
## 12 0.013 | -0.392 0.180 0.033 |
## 13 0.002 | -0.253 0.075 0.010 |
## 14 0.000 | -0.065 0.005 0.001 |
##
## Variables
## Dim.1 ctr cos2 Dim.2 ctr cos2 Dim.3 ctr cos2
## Ozone | 0.875 30.997 0.765 | -0.152 2.062 0.023 | 0.149 2.220 0.022 |
## Solar.R | 0.432 7.569 0.187 | -0.693 43.144 0.480 | 0.064 0.408 0.004 |
## Wind | -0.756 23.171 0.572 | -0.073 0.475 0.005 | -0.058 0.336 0.003 |
## Temp | 0.877 31.155 0.769 | 0.105 0.997 0.011 | -0.047 0.217 0.002 |
## Month | 0.405 6.654 0.164 | 0.749 50.438 0.561 | -0.143 2.035 0.020 |
## Day | -0.106 0.454 0.011 | 0.179 2.884 0.032 | 0.973 94.784 0.946 |
## Dim.4 ctr cos2
## Ozone -0.166 3.596 0.028 |
## Solar.R 0.521 35.323 0.271 |
## Wind 0.487 30.824 0.237 |
## Temp 0.130 2.189 0.017 |
## Month 0.455 26.940 0.207 |
## Day 0.093 1.128 0.009 |
# Perform the screeplot test: look for the "elbow" among the first 5 PCs.
factoextra::fviz_screeplot(pca_air, ncp = 5)
# Reload the full dataset (the airquality object was overwritten above
# with its complete cases only).
data(airquality, package="datasets")
# Conduct a parallel analysis with paran(); it is given only the complete
# rows.  Horn's method retains components whose adjusted eigenvalue
# exceeds 1 (see the results table below).
air_paran <- paran::paran(airquality[complete.cases(airquality), ])
##
## Using eigendecomposition of correlation matrix.
## Computing: 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%
##
##
## Results of Horn's Parallel Analysis for component retention
## 180 iterations, using the mean estimate
##
## --------------------------------------------------
## Component Adjusted Unadjusted Estimated
## Eigenvalue Eigenvalue Bias
## --------------------------------------------------
## 1 2.144440 2.468840 0.324399
## --------------------------------------------------
##
## Adjusted eigenvalues > 1 indicate dimensions to retain.
## (1 components retained)
# Check out air_paran's suggested number of PCs to retain
# (the count of adjusted eigenvalues > 1 from the paran output above).
air_paran$Retained
## [1] 1
# Conduct a parallel analysis with psych::fa.parallel(), this time on the
# full dataset (NAs included; the warnings it emits are shown below).
air_fa_parallel <- psych::fa.parallel(airquality)
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected. Examine the results carefully
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected. Examine the results carefully
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected. Examine the results carefully
## Parallel analysis suggests that the number of factors = 3 and the number of components = 1
# Check out air_fa_parallel's suggested number of PCs to retain
# (matches the "number of components = 1" message printed above).
air_fa_parallel$ncomp
## [1] 1
# Check out the summary of airquality: the NA's rows below show 37 missing
# Ozone values and 7 missing Solar.R values.
summary(airquality)
## Ozone Solar.R Wind Temp
## Min. : 1.00 Min. : 7.0 Min. : 1.700 Min. :56.00
## 1st Qu.: 18.00 1st Qu.:115.8 1st Qu.: 7.400 1st Qu.:72.00
## Median : 31.50 Median :205.0 Median : 9.700 Median :79.00
## Mean : 42.13 Mean :185.9 Mean : 9.958 Mean :77.88
## 3rd Qu.: 63.25 3rd Qu.:258.8 3rd Qu.:11.500 3rd Qu.:85.00
## Max. :168.00 Max. :334.0 Max. :20.700 Max. :97.00
## NA's :37 NA's :7
## Month Day
## Min. :5.000 Min. : 1.0
## 1st Qu.:6.000 1st Qu.: 8.0
## Median :7.000 Median :16.0
## Mean :6.993 Mean :15.8
## 3rd Qu.:8.000 3rd Qu.:23.0
## Max. :9.000 Max. :31.0
##
# Check out the number of cells with missing values
# (44 = 37 missing Ozone + 7 missing Solar.R).
sum(is.na(airquality))
## [1] 44
# Check out the number of rows with missing values.
# Counting incomplete rows directly avoids materializing a subset just to
# take nrow() of it; the printed result is unchanged.
sum(!complete.cases(airquality))
## [1] 42
# Estimate the optimal number of dimensions for imputation.
# estim_ncpPCA() cross-validates the reconstruction error for ncp = 0..5;
# the ncp minimizing the criterion (here 0, per the output below) is the
# suggested setting for missMDA's PCA-based imputation.
missMDA::estim_ncpPCA(airquality, ncp.max=5)
## $ncp
## [1] 0
##
## $criterion
## 0 1 2 3 4 5
## 1520.506 1823.946 1771.702 2774.323 2888.306 6369.592
# Load a previously fitted NMF result from disk; the actual nmf() fit is
# left commented out below because corpus_tdm is built elsewhere.
bbc_res <- readRDS("./RInputFiles/bbc_res.rds")
# Get a 5-rank approximation of corpus_tdm.
# bbc_res <- NMF::nmf(corpus_tdm, 5)
# Get the term-topic matrix W.
W <- NMF::basis(bbc_res)
# Check out the dimensions of W (3137 terms x 5 topics, per the output).
dim(W)
## [1] 3137 5
# Normalize W.
normal <- function(x) { x / sum(x) }
normal_W <- apply(W, 2, FUN=normal)
# Get the topic-text matrix H.
# NOTE(review): coef() is unqualified here while the other calls use
# NMF:: -- dispatch presumably works because loading the NMF namespace
# registers its method, but NMF::coef() would be more consistent; verify.
H <- coef(bbc_res)
# Check out the dimensions of H (5 topics x 50 texts, per the output).
dim(H)
## [1] 5 50
# Normalize H column-wise so each text's topic weights sum to 1.
normal_H <- apply(H, 2, FUN=normal)
# Explore the NMF package's available algorithms.
alg <- NMF::nmfAlgorithm()
# Choose the algorithms implemented in R.
R_alg <- NMF::nmfAlgorithm(version="R")
# Use the R-implemented algorithms; .options = "v" requests verbose output.
# bbc_double_opt <- NMF::nmf(x=corpus_tdm, rank=5, method=R_alg, .options="v")
Chapter 3 - Exploratory Factor Analysis
Intro to EFA:
Intro to EFA: Data Factorability:
Extraction methods:
Choosing the right number of factors:
Example code includes:
# --- Load the Humor Styles Questionnaire (HSQ) data for the EFA chapter.
# Semicolon-delimited file; all columns parse as doubles.
hsq <- readr::read_delim("./RInputFiles/humor_dataset.csv", delim=";")
## Parsed with column specification:
## cols(
## .default = col_double()
## )
## See spec(...) for full column specifications.
# Inspect the structure (1071 respondents, 32 items Q1..Q32 plus scales).
str(hsq, give.attr=FALSE)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 1071 obs. of 39 variables:
## $ Q1 : num 2 2 3 3 1 3 4 2 2 4 ...
## $ Q2 : num 2 3 4 3 4 3 1 4 2 2 ...
## $ Q3 : num 3 2 3 3 2 3 2 4 1 4 ...
## $ Q4 : num 1 2 3 4 2 2 4 1 1 1 ...
## $ Q5 : num 4 4 4 3 3 3 2 5 3 3 ...
## $ Q6 : num 5 4 4 5 5 3 3 5 4 5 ...
## $ Q7 : num 4 4 3 4 4 4 3 4 3 4 ...
## $ Q8 : num 3 3 1 3 1 2 3 3 1 2 ...
## $ Q9 : num 4 4 2 -1 4 2 4 2 3 3 ...
## $ Q10 : num 3 3 4 4 4 1 4 4 3 1 ...
## $ Q11 : num 3 4 3 2 2 3 4 3 2 5 ...
## $ Q12 : num 1 3 2 4 2 3 1 3 2 3 ...
## $ Q13 : num 5 3 4 4 5 4 2 5 5 1 ...
## $ Q14 : num 4 4 4 5 4 4 1 4 3 3 ...
## $ Q15 : num 4 5 3 4 4 4 2 3 3 1 ...
## $ Q16 : num 4 4 3 3 4 3 4 3 4 5 ...
## $ Q17 : num 2 2 2 3 2 2 4 3 2 5 ...
## $ Q18 : num 3 2 4 3 3 1 1 4 2 1 ...
## $ Q19 : num 3 3 2 3 2 4 3 5 4 3 ...
## $ Q20 : num 1 2 1 3 1 2 1 3 1 1 ...
## $ Q21 : num 4 3 4 4 5 4 3 4 4 2 ...
## $ Q22 : num 4 3 2 3 3 4 2 3 4 1 ...
## $ Q23 : num 3 4 4 2 3 4 2 3 4 5 ...
## $ Q24 : num 2 2 3 4 1 2 3 1 2 2 ...
## $ Q25 : num 1 2 2 2 1 2 4 1 1 4 ...
## $ Q26 : num 3 5 4 4 5 3 3 4 3 5 ...
## $ Q27 : num 2 1 3 2 2 2 2 2 4 5 ...
## $ Q28 : num 4 2 3 2 3 4 2 4 4 2 ...
## $ Q29 : num 2 4 2 4 2 3 3 1 1 1 ...
## $ Q30 : num 4 4 5 5 5 4 3 5 5 5 ...
## $ Q31 : num 2 3 4 3 4 3 4 2 2 3 ...
## $ Q32 : num 2 1 2 3 2 3 4 2 1 2 ...
## $ affiliative : num 4 3.3 3.9 3.6 4.1 3.6 2.3 4.4 4.1 2.4 ...
## $ selfenhancing: num 3.5 3.5 3.9 4 4.1 2.9 2.3 4.1 3.3 2.9 ...
## $ agressive : num 3 3.3 3.1 2.9 2.9 3.4 2.8 3.3 2.9 3.8 ...
## $ selfdefeating: num 2.3 2.4 2.3 3.3 2 2.6 2.8 2.5 2 2.3 ...
## $ age : num 25 44 50 30 52 30 27 34 30 18 ...
## $ gender : num 2 2 1 2 1 2 1 1 2 1 ...
## $ accuracy : num 100 90 75 85 80 60 60 88 95 85 ...
# Check out the dimensionality of hsq.
dim(hsq)
## [1] 1071 39
# Explore the correlation object hsq_correl.
# Treat items Q1..Q32 (columns 1:32) as polytomous; no continuous columns.
hsq_correl <- psych::mixedCor(hsq, c=NULL, p=1:32)
## Warning in matpLower(x, nvar, gminx, gmaxx, gminy, gmaxy): 496 cells were
## adjusted for 0 values using the correction for continuity. Examine your
## data carefully.
str(hsq_correl)
## List of 6
## $ rho : num [1:32, 1:32] 1 -0.2094 -0.1772 -0.0945 -0.4466 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:32] "Q1" "Q2" "Q3" "Q4" ...
## .. ..$ : chr [1:32] "Q1" "Q2" "Q3" "Q4" ...
## $ rx : NULL
## $ poly :List of 4
## ..$ rho : num [1:32, 1:32] 1 -0.2094 -0.1772 -0.0945 -0.4466 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : chr [1:32] "Q1" "Q2" "Q3" "Q4" ...
## .. .. ..$ : chr [1:32] "Q1" "Q2" "Q3" "Q4" ...
## ..$ tau : num [1:32, 1:6] -2.77 -2.77 -2.9 -3.11 -2.9 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : chr [1:32] "Q1" "Q2" "Q3" "Q4" ...
## .. .. ..$ : chr [1:6] "1" "2" "3" "4" ...
## ..$ n.obs: int 1071
## ..$ Call : language polychoric(x = data[, p], smooth = smooth, global = global, weight = weight, correct = correct)
## ..- attr(*, "class")= chr [1:2] "psych" "poly"
## $ tetra:List of 2
## ..$ rho: NULL
## ..$ tau: NULL
## $ rpd : NULL
## $ Call : language psych::mixedCor(data = hsq, c = NULL, p = 1:32)
## - attr(*, "class")= chr [1:2] "psych" "mixed"
# Getting the correlation matrix of the dataset.
hsq_polychoric <- hsq_correl$poly$rho
# Explore the correlation structure of the dataset.
ggcorrplot::ggcorrplot(hsq_polychoric)
# Apply the Bartlett test on the correlation matrix.
# NOTE(review): n is not supplied, so the default n = 100 is used (see the
# warning below); for a correct chi-square, pass n = 1071 — confirm intent.
psych::cortest.bartlett(hsq_polychoric)
## Warning in psych::cortest.bartlett(hsq_polychoric): n not specified, 100
## used
## $chisq
## [1] 1114.409
##
## $p.value
## [1] 1.610583e-49
##
## $df
## [1] 496
# Check the KMO index.
# (Overall MSA = 0.87, well above the common 0.6 adequacy threshold.)
psych::KMO(hsq_polychoric)
## Kaiser-Meyer-Olkin factor adequacy
## Call: psych::KMO(r = hsq_polychoric)
## Overall MSA = 0.87
## MSA for each item =
## Q1 Q2 Q3 Q4 Q5 Q6 Q7 Q8 Q9 Q10 Q11 Q12 Q13 Q14 Q15
## 0.94 0.93 0.91 0.90 0.91 0.88 0.82 0.86 0.95 0.86 0.78 0.90 0.85 0.93 0.82
## Q16 Q17 Q18 Q19 Q20 Q21 Q22 Q23 Q24 Q25 Q26 Q27 Q28 Q29 Q30
## 0.85 0.87 0.83 0.89 0.83 0.87 0.84 0.81 0.84 0.83 0.89 0.83 0.93 0.87 0.81
## Q31 Q32
## 0.81 0.91
# EFA with four factors.
# (Default extraction is minres with oblimin rotation.)
f_hsq <- psych::fa(hsq_polychoric, nfactors=4)
# Inspect the resulting EFA object.
str(f_hsq)
## List of 44
## $ residual : num [1:32, 1:32] 0.5202 0.01625 0.02522 -0.00162 0.00493 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:32] "Q1" "Q2" "Q3" "Q4" ...
## .. ..$ : chr [1:32] "Q1" "Q2" "Q3" "Q4" ...
## $ dof : num 374
## $ ENull : num NA
## $ chi : num NA
## $ rms : num 0.0411
## $ nh : logi NA
## $ EPVAL : num NA
## $ crms : num 0.0473
## $ EBIC : num NA
## $ ESABIC : num NA
## $ fit : num 0.849
## $ fit.off : num 0.969
## $ sd : num 0.0397
## $ factors : num 4
## $ complexity : Named num [1:32] 1.02 1.05 1.29 1 1.08 ...
## ..- attr(*, "names")= chr [1:32] "Q1" "Q2" "Q3" "Q4" ...
## $ n.obs : logi NA
## $ PVAL : logi NA
## $ objective : num 2.19
## $ criteria : Named num [1:3] 2.19 NA NA
## ..- attr(*, "names")= chr [1:3] "objective" "" ""
## $ Call : language psych::fa(r = hsq_polychoric, nfactors = 4)
## $ null.model : num 12.7
## $ null.dof : num 496
## $ r.scores : num [1:4, 1:4] 1 -0.166 -0.434 0.212 -0.166 ...
## $ R2 : num [1:4] 0.886 0.867 0.866 0.821
## $ valid : num [1:4] 0.93 0.915 0.902 0.889
## $ score.cor : num [1:4, 1:4] 1 -0.198 -0.44 0.229 -0.198 ...
## $ weights : num [1:32, 1:4] 0.14289 -0.01175 -0.01633 0.00872 -0.13143 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:32] "Q1" "Q2" "Q3" "Q4" ...
## .. ..$ : chr [1:4] "MR1" "MR2" "MR4" "MR3"
## $ rotation : chr "oblimin"
## $ communality : Named num [1:32] 0.48 0.408 0.363 0.407 0.433 ...
## ..- attr(*, "names")= chr [1:32] "Q1" "Q2" "Q3" "Q4" ...
## $ communalities: Named num [1:32] 0.48 0.408 0.363 0.407 0.433 ...
## ..- attr(*, "names")= chr [1:32] "Q1" "Q2" "Q3" "Q4" ...
## $ uniquenesses : Named num [1:32] 0.52 0.592 0.637 0.593 0.567 ...
## ..- attr(*, "names")= chr [1:32] "Q1" "Q2" "Q3" "Q4" ...
## $ values : num [1:32] 6.37 2.96 2.4 1.74 0.78 ...
## $ e.values : num [1:32] 6.91 3.5 2.96 2.28 1.45 ...
## $ loadings : 'loadings' num [1:32, 1:4] 0.6746 -0.0849 -0.1127 0.0182 -0.5985 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:32] "Q1" "Q2" "Q3" "Q4" ...
## .. ..$ : chr [1:4] "MR1" "MR2" "MR4" "MR3"
## $ model : num [1:32, 1:32] 0.4798 -0.2256 -0.2024 -0.0929 -0.4516 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:32] "Q1" "Q2" "Q3" "Q4" ...
## .. ..$ : chr [1:32] "Q1" "Q2" "Q3" "Q4" ...
## $ fm : chr "minres"
## $ rot.mat : num [1:4, 1:4] 0.534 -0.503 -0.279 -0.764 0.308 ...
## $ Phi : num [1:4, 1:4] 1 -0.142 -0.375 0.18 -0.142 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:4] "MR1" "MR2" "MR4" "MR3"
## .. ..$ : chr [1:4] "MR1" "MR2" "MR4" "MR3"
## $ Structure : 'loadings' num [1:32, 1:4] 0.6895 -0.3145 -0.2541 -0.0802 -0.6464 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:32] "Q1" "Q2" "Q3" "Q4" ...
## .. ..$ : chr [1:4] "MR1" "MR2" "MR4" "MR3"
## $ method : chr "regression"
## $ R2.scores : Named num [1:4] 0.886 0.867 0.866 0.821
## ..- attr(*, "names")= chr [1:4] "MR1" "MR2" "MR4" "MR3"
## $ r : num [1:32, 1:32] 1 -0.2094 -0.1772 -0.0945 -0.4466 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:32] "Q1" "Q2" "Q3" "Q4" ...
## .. ..$ : chr [1:32] "Q1" "Q2" "Q3" "Q4" ...
## $ fn : chr "fa"
## $ Vaccounted : num [1:5, 1:4] 3.973 0.124 0.124 0.295 0.295 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:5] "SS loadings" "Proportion Var" "Cumulative Var" "Proportion Explained" ...
## .. ..$ : chr [1:4] "MR1" "MR2" "MR4" "MR3"
## - attr(*, "class")= chr [1:2] "psych" "fa"
# Use maximum likelihood for extracting factors.
# (Same 4-factor model as above, with fm="mle" instead of the minres default.)
psych::fa(hsq_polychoric, nfactors=4, fm="mle")
## Factor Analysis using method = ml
## Call: psych::fa(r = hsq_polychoric, nfactors = 4, fm = "mle")
## Standardized loadings (pattern matrix) based upon correlation matrix
## ML1 ML2 ML4 ML3 h2 u2 com
## Q1 0.67 -0.05 -0.01 0.04 0.48 0.52 1.0
## Q2 -0.09 -0.03 0.61 -0.04 0.42 0.58 1.1
## Q3 -0.12 0.13 0.08 -0.50 0.35 0.65 1.3
## Q4 0.01 0.62 0.01 0.00 0.39 0.61 1.0
## Q5 -0.60 0.07 0.08 -0.01 0.42 0.58 1.1
## Q6 -0.28 -0.03 0.42 -0.03 0.34 0.66 1.8
## Q7 -0.17 -0.03 -0.01 0.61 0.37 0.63 1.2
## Q8 -0.03 0.76 0.01 0.01 0.58 0.42 1.0
## Q9 0.47 -0.12 0.00 0.03 0.26 0.74 1.1
## Q10 0.05 0.05 0.77 -0.03 0.58 0.42 1.0
## Q11 0.13 0.00 0.15 -0.53 0.29 0.71 1.3
## Q12 -0.14 0.66 -0.04 0.05 0.45 0.55 1.1
## Q13 -0.68 0.00 0.11 0.01 0.53 0.47 1.1
## Q14 -0.18 -0.02 0.64 0.02 0.51 0.49 1.2
## Q15 0.04 -0.05 0.07 0.69 0.50 0.50 1.0
## Q16 0.10 -0.53 0.11 0.15 0.33 0.67 1.3
## Q17 0.76 -0.03 0.04 0.05 0.58 0.42 1.0
## Q18 0.11 0.00 0.80 0.00 0.58 0.42 1.0
## Q19 -0.14 0.15 0.21 -0.40 0.34 0.66 2.2
## Q20 0.11 0.79 -0.01 -0.07 0.64 0.36 1.1
## Q21 -0.66 0.13 0.13 0.17 0.55 0.45 1.3
## Q22 0.11 0.06 -0.29 0.13 0.14 0.86 1.8
## Q23 0.11 0.04 -0.01 0.51 0.29 0.71 1.1
## Q24 0.20 0.49 0.08 0.05 0.27 0.73 1.4
## Q25 0.77 0.06 -0.01 0.02 0.59 0.41 1.0
## Q26 -0.08 0.03 0.68 0.04 0.52 0.48 1.0
## Q27 0.09 0.07 0.11 -0.51 0.28 0.72 1.2
## Q28 -0.10 0.27 0.24 -0.13 0.23 0.77 2.8
## Q29 0.59 0.18 0.06 0.16 0.38 0.62 1.4
## Q30 -0.02 -0.13 0.51 0.00 0.25 0.75 1.1
## Q31 0.10 0.07 0.07 0.70 0.51 0.49 1.1
## Q32 -0.08 0.67 0.06 0.02 0.50 0.50 1.0
##
## ML1 ML2 ML4 ML3
## SS loadings 4.02 3.30 3.38 2.75
## Proportion Var 0.13 0.10 0.11 0.09
## Cumulative Var 0.13 0.23 0.33 0.42
## Proportion Explained 0.30 0.25 0.25 0.20
## Cumulative Proportion 0.30 0.54 0.80 1.00
##
## With factor correlations of
## ML1 ML2 ML4 ML3
## ML1 1.00 -0.14 -0.38 0.18
## ML2 -0.14 1.00 0.25 -0.17
## ML4 -0.38 0.25 1.00 -0.05
## ML3 0.18 -0.17 -0.05 1.00
##
## Mean item complexity = 1.3
## Test of the hypothesis that 4 factors are sufficient.
##
## The degrees of freedom for the null model are 496 and the objective function was 12.74
## The degrees of freedom for the model are 374 and the objective function was 2.18
##
## The root mean square of the residuals (RMSR) is 0.04
## The df corrected root mean square of the residuals is 0.05
##
## Fit based upon off diagonal values = 0.97
## Measures of factor score adequacy
## ML1 ML2 ML4 ML3
## Correlation of (regression) scores with factors 0.94 0.93 0.93 0.91
## Multiple R square of scores with factors 0.89 0.87 0.87 0.82
## Minimum correlation of possible factor scores 0.78 0.74 0.74 0.64
# Use PAF on hsq_polychoric.
hsq_correl_pa <- psych::fa(hsq_polychoric, nfactors=4, fm="pa")
# Sort the communalities of the f_hsq_pa.
# (Communality = variance of each item explained by the retained factors.)
f_hsq_pa_common <- sort(hsq_correl_pa$communality, decreasing = TRUE)
# Sort the uniqueness of the f_hsq_pa.
# (Uniqueness = 1 - communality.)
f_hsq_pa_unique <- sort(hsq_correl_pa$uniqueness, decreasing = TRUE)
# Check out the scree test and the Kaiser-Guttman criterion.
psych::scree(hsq_polychoric)
# Use parallel analysis for estimation with the minres extraction method.
# NOTE(review): n.obs = 1069 here, but hsq has 1071 rows — confirm whether
# two cases were dropped upstream or this should read 1071.
psych::fa.parallel(hsq_polychoric, n.obs = 1069, fm = "minres", fa = "fa")
## Parallel analysis suggests that the number of factors = 7 and the number of components = NA
# Use parallel analysis for estimation with the mle extraction method.
psych::fa.parallel(hsq_polychoric, n.obs = 1069, fm = "mle", fa = "fa")
## Parallel analysis suggests that the number of factors = 7 and the number of components = NA
Chapter 4 - Advanced EFA
Interpretation of EFA and factor rotation:
Interpretation of EFA and path diagrams:
EFA case study:
Wrap up:
Example code includes:
# Check the default rotation method.
f_hsq$rotation
## [1] "oblimin"
# Try Promax, another oblique rotation method.
# (Oblique rotations allow factors to correlate.)
f_hsq_promax <- psych::fa(hsq_polychoric, nfactors=4, rotate="promax")
# Try Varimax, an orthogonal method.
# (Orthogonal rotations force factors to be uncorrelated.)
f_hsq_varimax <- psych::fa(hsq_polychoric, nfactors=4, rotate="varimax")
# Check the factor loadings.
# cut=0 prints all loadings instead of suppressing small ones.
print(f_hsq$loadings, cut=0)
##
## Loadings:
## MR1 MR2 MR4 MR3
## Q1 0.675 -0.055 -0.005 0.029
## Q2 -0.085 -0.023 0.604 -0.034
## Q3 -0.113 0.130 0.082 -0.512
## Q4 0.018 0.635 0.023 0.002
## Q5 -0.599 0.066 0.095 -0.015
## Q6 -0.257 -0.038 0.462 -0.040
## Q7 -0.166 -0.030 0.002 0.607
## Q8 -0.027 0.741 0.009 0.007
## Q9 0.485 -0.124 0.011 0.016
## Q10 0.034 0.056 0.736 -0.020
## Q11 0.139 -0.018 0.163 -0.541
## Q12 -0.142 0.663 -0.045 0.054
## Q13 -0.641 0.006 0.143 0.005
## Q14 -0.173 -0.018 0.644 0.022
## Q15 0.055 -0.044 0.078 0.688
## Q16 0.123 -0.523 0.118 0.126
## Q17 0.769 -0.035 0.043 0.050
## Q18 0.107 0.005 0.780 -0.004
## Q19 -0.122 0.141 0.229 -0.412
## Q20 0.099 0.779 -0.021 -0.082
## Q21 -0.641 0.131 0.153 0.156
## Q22 0.136 0.059 -0.267 0.105
## Q23 0.124 0.036 0.004 0.491
## Q24 0.218 0.509 0.083 0.042
## Q25 0.761 0.061 -0.020 0.007
## Q26 -0.078 0.033 0.685 0.032
## Q27 0.103 0.058 0.114 -0.530
## Q28 -0.075 0.272 0.248 -0.155
## Q29 0.607 0.182 0.068 0.151
## Q30 0.005 -0.133 0.538 -0.008
## Q31 0.107 0.066 0.080 0.693
## Q32 -0.074 0.694 0.051 0.023
##
## MR1 MR2 MR4 MR3
## SS loadings 3.768 3.241 3.233 2.688
## Proportion Var 0.118 0.101 0.101 0.084
## Cumulative Var 0.118 0.219 0.320 0.404
# Create the path diagram of the latent factors.
psych::fa.diagram(f_hsq)
# --- EFA case study: Short Dark Triad (SD3) data.
SD3 <- readRDS("./RInputFiles/SD3.RDS")
# SD3_mod <- SD3 %>% mutate_all(factor, levels=1:5)
# Heterogeneous correlations (Pearson/polyserial/polychoric as appropriate).
sdt_sub_correl <- polycor::hetcor(SD3)
# Explore sdt_sub_correl.
str(sdt_sub_correl)
## List of 7
## $ correlations: num [1:27, 1:27] 1 0.184 0.102 0.217 0.369 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:27] "M1" "M2" "M3" "M4" ...
## .. ..$ : chr [1:27] "M1" "M2" "M3" "M4" ...
## $ type : chr [1:27, 1:27] "" "Pearson" "Pearson" "Pearson" ...
## $ NA.method : chr "complete.obs"
## $ ML : logi FALSE
## $ std.errors : num [1:27, 1:27] 0 0.0969 0.0993 0.0956 0.0868 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:27] "M1" "M2" "M3" "M4" ...
## .. ..$ : chr [1:27] "M1" "M2" "M3" "M4" ...
## $ n : int 100
## $ tests : num [1:27, 1:27] 0.00 5.78e-13 1.55e-16 8.63e-14 4.36e-14 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:27] "M1" "M2" "M3" "M4" ...
## .. ..$ : chr [1:27] "M1" "M2" "M3" "M4" ...
## - attr(*, "class")= chr "hetcor"
# Get the correlation matrix of the sdt_sub_correl.
sdt_polychoric <- sdt_sub_correl$correlations
# Apply the Bartlett test on the correlation matrix.
# NOTE(review): n defaults to 100 here (see warning); this matches the SD3
# sample size shown by str() above (n = 100), so the default is harmless.
psych::cortest.bartlett(sdt_polychoric)
## Warning in psych::cortest.bartlett(sdt_polychoric): n not specified, 100 used
## $chisq
## [1] 1019.442
##
## $p.value
## [1] 2.054927e-66
##
## $df
## [1] 351
# Check the KMO index.
psych::KMO(sdt_polychoric)
## Kaiser-Meyer-Olkin factor adequacy
## Call: psych::KMO(r = sdt_polychoric)
## Overall MSA = 0.82
## MSA for each item =
## M1 M2 M3 M4 M5 M6 M7 M8 M9 N1 N2 N3 N4 N5 N6 N7
## 0.78 0.84 0.80 0.66 0.91 0.84 0.68 0.77 0.79 0.80 0.82 0.83 0.87 0.85 0.84 0.80
## N8 N9 P1 P2 P3 P4 P5 P6 P7 P8 P9
## 0.81 0.89 0.89 0.64 0.87 0.52 0.81 0.88 0.52 0.63 0.85
# Check out the scree test.
psych::scree(sdt_polychoric)
# Use parallel analysis for estimation with the minres extraction method.
psych::fa.parallel(sdt_polychoric, n.obs = 100, fa = "fa")
## Parallel analysis suggests that the number of factors = 4 and the number of components = NA
# Perform EFA with MLE.
f_sdt <- psych::fa(sdt_polychoric, fm = "ml", nfactors = 4)
# Check the factor loadings.
# cut=0 prints all loadings instead of suppressing small ones.
print(f_sdt$loadings, cut=0)
##
## Loadings:
## ML1 ML4 ML2 ML3
## M1 0.005 0.043 0.578 -0.194
## M2 0.236 0.407 0.193 0.152
## M3 -0.019 0.654 0.023 0.091
## M4 0.029 0.329 0.254 -0.134
## M5 0.184 0.179 0.550 0.075
## M6 0.064 -0.099 0.849 0.055
## M7 0.104 0.171 0.438 -0.454
## M8 0.504 0.255 -0.025 -0.183
## M9 0.048 0.325 0.450 0.037
## N1 0.082 0.202 0.033 0.409
## N2 0.037 -0.160 -0.105 -0.501
## N3 0.221 0.056 0.012 0.615
## N4 -0.014 0.438 0.160 0.372
## N5 -0.059 0.580 0.107 0.166
## N6 -0.299 -0.300 0.104 -0.356
## N7 -0.189 0.346 0.222 0.219
## N8 -0.197 -0.058 -0.276 -0.334
## N9 0.754 -0.003 0.014 -0.017
## P1 0.411 0.012 0.296 0.053
## P2 0.001 -0.129 -0.089 -0.213
## P3 0.395 -0.008 0.220 0.020
## P4 0.015 0.104 -0.111 0.318
## P5 0.556 0.026 0.076 0.070
## P6 0.634 -0.047 0.174 0.139
## P7 -0.419 0.131 0.190 -0.016
## P8 0.101 0.594 -0.179 -0.277
## P9 0.261 0.525 -0.049 0.084
##
## ML1 ML4 ML2 ML3
## SS loadings 2.445 2.432 2.304 1.844
## Proportion Var 0.091 0.090 0.085 0.068
## Cumulative Var 0.091 0.181 0.266 0.334
# Create the path diagram of the latent factors.
# (Draws each item loading onto its dominant factor for the SD3 model.)
psych::fa.diagram(f_sdt)
Chapter 1 - Statistical Outlier Detection
Meaning of anomalies:
Testing extremes with Grubbs’ test:
Anomalies in time series:
Example code includes:
# --- Build the river nitrate dataset: 291 monthly observations (24 full
# --- years plus Jan-Mar), with a month factor cycling Jan..Dec.
river <- data.frame(index=1:291,
nitrate=c(1.581, 1.323, 1.14, 1.245, 1.072, 1.483, 1.162, 1.304, 1.14, 1.118, 1.342, 1.245, 1.204, 1.14, 1.204, 1.118, 1.025, 1.118, 1.285, 1.14, 0.949, 0.922, 0.949, 1.118, 1.265, 1.095, 1.183, 1.162, 1.118, 1.285, 1.049, 0.922, 0.775, 0.866, 0.922, 1.643, 1.323, 1.285, 1.095, 1.049, 1.095, 0.922, 0.866, 1.049, 0.922, 1.095, 1.183, 1.304, 1.162, 1.225, 1.285, 1.072, 1.533, 1.095, 1.396, 1.025, 0.922, 0.949, 1.118, 1.342, 1.36, 1.36, 1.204, 1.265, 1, 1.183, 1.025, 0.866, 1.072, 1.049, 1.049, 1.049, 1.095, 1.183, 1.095, 0.975, 1.118, 0.975, 1.049, 0.837, 0.922, 1.118, 1.072, 1.204, 0.975, 1.095, 1.049, 0.866, 0.922, 1.049, 1.127, 1.072, 0.975, 1.049, 1.183, 1.245, 1.225, 1.225, 1.265, 1.118, 1.14, 1.072, 1.095, 0.671, 1.183, 0.949, 1.162, 1.095, 1.323, 1.342, 1.277, 1.015, 1, 0.922, 0.894, 1, 1.049, 0.922, 1.517, 1.265, 1.414, 1.304, 1.14, 1.14, 1.049, 1.068, 0.906, 1.095, 0.883, 1.14, 1.025, 1.36, 1.183, 1.265, 1.304, 0.964, 0.975, 0.99, 0.877, 1.049, 0.975, 1, 1.183, 1.225, 1.265, 1.183, 1.049, 0.97, 0.894, 0.98, 0.964, 0.894, 0.922, 1.14, 1.183, 1.897, 1.095, 1.14, 1.414, 1.14, 1, 1.049, 0.889, 0.872, 1, 1.095, 0.671, 1.095, 1.14, 1.304, 1.025, 0.975, 1, 0.877, 0.949, 0.866, 1.058, 1.086, 1.118, 1.162, 1.221, 1.265, 1.122, 1.015, 1.162, 0.825, 0.906, 0.849, 0.985, 1.118, 1.077, 1.237, 1.237, 1.063, 1.01, 0.933, 0.922, 0.806, 0.748, 0.592, 0.911, 0.806, 0.98, 1.077, 1.212, 1.277, 0.954, 0.837, 0.917, 0.9, 1.068, 0.872, 0.99, 1.131, 1.068, 1.208, 1.319, 1.281, 0.905, 0.819, 0.826, 0.974, 0.888, 0.804, 0.996, 1.127, 1.17, 1.166, 1.261, 1.275, 1.179, 1.079, 0.951, 0.852, 0.872, 0.834, 0.859, 1.077, 1.095, 1.285, 1.323, 1.16, 1.125, 0.957, 0.948, 0.907, 0.89, 0.999, 0.999, 0.953, 0.9, 0.986, 1.187, 1.054, 1.079, 0.997, 0.851, 0.803, 0.971, 1.025, 1.086, 1.114, 1.068, 1.091, 1.034, 0.871, 0.781, 0.865, 0.7, 0.673, 0.881, 0.782, 0.97, 1.044, 1.17, 1.196, 1.091, 1.068, 0.967, 0.823, 0.73, 0.693, 0.788, 1.095, 1.183, 0.996, 1.105, 0.939, 0.914, 0.813, 0.775),
month=factor(month.name[c(rep(1:12, times=24), 1:3)], levels=month.name)
)
# Confirm the expected structure: index (int), nitrate (num), month (factor).
str(river)
## 'data.frame': 291 obs. of 3 variables:
## $ index : int 1 2 3 4 5 6 7 8 9 10 ...
## $ nitrate: num 1.58 1.32 1.14 1.25 1.07 ...
## $ month : Factor w/ 12 levels "January","February",..: 1 2 3 4 5 6 7 8 9 10 ...
# Explore contents of dataset
head(river)
## index nitrate month
## 1 1 1.581 January
## 2 2 1.323 February
## 3 3 1.140 March
## 4 4 1.245 April
## 5 5 1.072 May
## 6 6 1.483 June
# Summary statistics of river nitrate concentrations
summary(river$nitrate)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.5920 0.9485 1.0680 1.0649 1.1700 1.8970
# Plot the distribution of nitrate concentration
boxplot(river$nitrate)
# Plot a histogram of the nitrate column
hist(river$nitrate)
# Add a Nitrate concentration label
hist(river$nitrate, xlab="Nitrate concentration")
# Separate the histogram into 40 bins
hist(river$nitrate, xlab = "Nitrate concentration", breaks = 40)
# Apply Grubbs' test to the nitrate data
# Grubbs' test assesses whether the single most extreme value is an outlier;
# here it flags the maximum (1.897) with p ~ 0.0002.
outliers::grubbs.test(river$nitrate)
##
## Grubbs test for one outlier
##
## data: river$nitrate
## G = 4.72676, U = 0.92269, p-value = 0.000211
## alternative hypothesis: highest value 1.897 is an outlier
# Use which.max to find row index of the max
which.max(river$nitrate)
## [1] 156
# Runs Grubbs' test excluding row 156
# With the first outlier removed, the new highest value (1.643) is no longer
# significant at the 5% level (p = 0.078).
outliers::grubbs.test(river$nitrate[-156])
##
## Grubbs test for one outlier
##
## data: river$nitrate[-156]
## G = 3.42983, U = 0.95915, p-value = 0.07756
## alternative hypothesis: highest value 1.643 is an outlier
# Print the value tested in the second Grubbs' test
# Fix: the second test's alternative hypothesis examined the HIGHEST remaining
# value, so the tested value is the maximum of the reduced data (1.643), not
# the minimum (0.592, which that test did not assess).
max(river$nitrate[-156])
## [1] 1.643
# View contents of dataset
head(river)
## index nitrate month
## 1 1 1.581 January
## 2 2 1.323 February
## 3 3 1.140 March
## 4 4 1.245 April
## 5 5 1.072 May
## 6 6 1.483 June
# Show the time series of nitrate concentrations with time
# NOTE(review): month is a factor, so this formula dispatches to the boxplot
# method (type = "o" is ignored). For a true time series, plot against index
# instead — confirm which plot was intended.
plot(nitrate ~ month, data = river, type = "o")
# Calculate the mean nitrate by month
monthly_mean <- tapply(river$nitrate, river$month, FUN = mean)
monthly_mean
## January February March April May June July August
## 1.2163600 1.1838400 1.1050400 1.0166250 0.9978333 0.9792083 0.9810417 0.9380833
## September October November December
## 0.9885833 1.0360000 1.0962500 1.2264167
# Plot the monthly means
plot(monthly_mean, type = "o", xlab = "Month", ylab = "Monthly mean")
# Create a boxplot of nitrate against months
boxplot(nitrate ~ month, data=river)
# Package ‘anomalyDetection’ was removed from the CRAN repository.
# Formerly available versions can be obtained from the archive.
# Archived on 2019-03-01 as check problems were not corrected in time.
# The Seasonal-Hybrid ESD calls below are kept commented for reference since
# the package is unavailable; period = 12 matches the monthly seasonality.
# Run Seasonal-Hybrid ESD for nitrate concentrations
# AnomalyDetectionVec(river$nitrate, period=12, direction = 'both', plot = T)
# Use Seasonal-Hybrid ESD for nitrate concentrations
# river_anomalies <- AnomalyDetectionVec(x = river$nitrate, period = 12, direction = 'both', plot = T)
# Print the anomalies
# river_anomalies$anoms
# Print the plot
# print(river_anomalies$plot)
Chapter 2 - Distance and Density Based Anomaly Detection
k-Nearest-Neighbors Score:
Visualizing kNN distance:
Local outlier factor (LOF):
Example code includes:
# --- Load the wine dataset for distance/density-based anomaly detection.
wineOrig <- readr::read_csv("./RInputFiles/big_wine.csv")
## Parsed with column specification:
## cols(
## fixed.acidity = col_double(),
## volatile.acidity = col_double(),
## citric.acid = col_double(),
## residual.sugar = col_double(),
## chlorides = col_double(),
## free.sulfur.dioxide = col_double(),
## total.sulfur.dioxide = col_double(),
## density = col_double(),
## pH = col_double(),
## sulphates = col_double(),
## alcohol = col_double(),
## quality = col_double(),
## good_wine = col_double()
## )
# Inspect the structure (14694 wines, 13 numeric columns).
str(wineOrig, give.attr=FALSE)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 14694 obs. of 13 variables:
## $ fixed.acidity : num 7 6.3 8.1 7.2 7.2 8.1 6.2 7 6.3 8.1 ...
## $ volatile.acidity : num 0.27 0.3 0.28 0.23 0.23 0.28 0.32 0.27 0.3 0.22 ...
## $ citric.acid : num 0.36 0.34 0.4 0.32 0.32 0.4 0.16 0.36 0.34 0.43 ...
## $ residual.sugar : num 20.7 1.6 6.9 8.5 8.5 6.9 7 20.7 1.6 1.5 ...
## $ chlorides : num 0.045 0.049 0.05 0.058 0.058 0.05 0.045 0.045 0.049 0.044 ...
## $ free.sulfur.dioxide : num 45 14 30 47 47 30 30 45 14 28 ...
## $ total.sulfur.dioxide: num 170 132 97 186 186 97 136 170 132 129 ...
## $ density : num 1.001 0.994 0.995 0.996 0.996 ...
## $ pH : num 3 3.3 3.26 3.19 3.19 3.26 3.18 3 3.3 3.22 ...
## $ sulphates : num 0.45 0.49 0.44 0.4 0.4 0.44 0.47 0.45 0.49 0.45 ...
## $ alcohol : num 8.8 9.5 10.1 9.9 9.9 10.1 9.6 8.8 9.5 11 ...
## $ quality : num 6 6 6 6 6 6 6 6 6 6 ...
## $ good_wine : num 1 1 1 1 1 1 1 1 1 1 ...
# Work with just two features for the kNN-distance examples.
wine <- wineOrig %>% select(pH, alcohol)
# View the contents of the wine data
head(wine)
## # A tibble: 6 x 2
## pH alcohol
## <dbl> <dbl>
## 1 3 8.8
## 2 3.3 9.5
## 3 3.26 10.1
## 4 3.19 9.9
## 5 3.19 9.9
## 6 3.26 10.1
# Scatterplot of wine pH against alcohol
plot(pH ~ alcohol, data = wine)
# Calculate the 5 nearest neighbors distance
# NOTE(review): computed on unscaled columns here; the scaled version is
# introduced further below once the scale mismatch is demonstrated.
wine_nn <- FNN::get.knn(wine, k = 5)
# View the distance matrix
# All-zero rows occur because the dataset contains exact duplicate rows,
# so each point's 5 nearest neighbors can sit at distance 0.
head(wine_nn$nn.dist)
## [,1] [,2] [,3] [,4] [,5]
## [1,] 0 0 0 0 0
## [2,] 0 0 0 0 0
## [3,] 0 0 0 0 0
## [4,] 0 0 0 0 0
## [5,] 0 0 0 0 0
## [6,] 0 0 0 0 0
# Distance from wine 5 to nearest neighbor
wine_nn$nn.dist[5, 1]
## [1] 0
# Row index of wine 5's nearest neighbor
wine_nn$nn.ind[5, 1]
## [1] 9800
# Return data for wine 5 and its nearest neighbor
# (Both rows are identical, explaining the zero distance.)
wine[c(5, wine_nn$nn.ind[5, 1]), ]
## # A tibble: 2 x 2
## pH alcohol
## <dbl> <dbl>
## 1 3.19 9.9
## 2 3.19 9.9
# Create score by averaging distances
wine_nnd <- rowMeans(wine_nn$nn.dist)
# Print row index of the most anomalous point
which.max(wine_nnd)
## [1] 3919
# Observe differences in column scales
# (alcohol spans ~6 units while pH spans ~1, so unscaled distances are
# dominated by alcohol.)
summary(wine)
## pH alcohol
## Min. :2.720 Min. : 8.00
## 1st Qu.:3.090 1st Qu.: 9.50
## Median :3.180 Median :10.40
## Mean :3.188 Mean :10.51
## 3rd Qu.:3.280 3rd Qu.:11.40
## Max. :3.820 Max. :14.20
# Standardize the wine columns
wine_scaled <- scale(wine)
# Observe standardized column scales
summary(wine_scaled)
## pH alcohol
## Min. :-3.10130 Min. :-2.04323
## 1st Qu.:-0.65081 1st Qu.:-0.82425
## Median :-0.05475 Median :-0.09286
## Mean : 0.00000 Mean : 0.00000
## 3rd Qu.: 0.60755 3rd Qu.: 0.71979
## Max. : 4.18393 Max. : 2.99522
# Print the 5-nearest neighbor distance score
wine_nnd[1:5]
## [1] 0 0 0 0 0
# Add the score as a new column
wine$score <- wine_nnd
# Scatterplot showing pH, alcohol and kNN score
# (Point size encodes the anomaly score.)
plot(pH ~ alcohol, data=wine, cex = sqrt(score), pch = 20)
# Calculate the LOF for wine data
# Drop the kNN score column first so LOF sees only pH and alcohol.
wine$score <- NULL
wine_lof <- dbscan::lof(scale(wine), k=5)
# Append the LOF score as a new column
wine$score <- wine_lof
# Scatterplot showing pH, alcohol and LOF score
plot(pH ~ alcohol, data=wine, cex=score, pch=20)
# Calculate and append kNN distance as a new column
# (wine_scaled was built from the two original columns, so the score columns
# added to wine do not leak into these distance calculations.)
wine_nn <- FNN::get.knn(wine_scaled, k = 10)
wine$score_knn <- rowMeans(wine_nn$nn.dist)
# Calculate and append LOF as a new column
wine$score_lof <- dbscan::lof(wine_scaled, k = 10)
# Find the row location of highest kNN
# (Global distance score and local density score disagree on the top anomaly.)
which.max(wine$score_knn)
## [1] 2957
# Find the row location of highest LOF
which.max(wine$score_lof)
## [1] 15
Chapter 3 - Isolation Forest
Isolation Trees:
Isolation Forest:
Visualizing Isolation Scores:
Example code includes:
# Reset wine to the two base features, dropping the score columns added above.
wine <- wine %>% select(pH, alcohol)
str(wine, give.attr=FALSE)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 14694 obs. of 2 variables:
## $ pH : num 3 3.3 3.26 3.19 3.19 3.26 3.18 3 3.3 3.22 ...
## $ alcohol: num 8.8 9.5 10.1 9.9 9.9 10.1 9.6 8.8 9.5 11 ...
# CRAN - package ‘isofor’ is not available (for R version 3.5.1)
# The isolation-forest calls below are kept commented for reference since
# the isofor package could not be installed.
# Build an isolation tree
# wine_tree <- iForest(wine, nt = 1)
# Create isolation score
# wine$tree_score <- predict(wine_tree, newdata = wine)
# Histogram plot of the scores
# hist(wine$tree_score, breaks=40)
# Fit isolation forest
# wine_forest <- iForest(wine, nt=100)
# Fit isolation forest
# wine_forest <- iForest(wine, nt = 100, phi = 200)
# Create isolation score from forest
# wine_score <- predict(wine_forest, newdata=wine)
# Append score to the wine data
# wine$score <- wine_score
# View the contents of the wine scores
# head(wine_scores)
# Score scatterplot 2000 vs 1000 trees
# plot(trees_2000 ~ trees_1000, data = wine_scores)
# Add reference line of equality
# abline(a = 0, b = 1)
# Sequence of values for pH and alcohol
# Build a 25 x 25 grid spanning the observed feature ranges so isolation
# scores could be visualized as a contour surface.
ph_seq <- seq(min(wine$pH), max(wine$pH), length.out = 25)
alcohol_seq <- seq(min(wine$alcohol), max(wine$alcohol) , length.out = 25)
# Create a data frame of grid coordinates
wine_grid <- expand.grid(pH = ph_seq, alcohol = alcohol_seq)
# Plot the grid
plot(pH ~ alcohol, data=wine_grid, pch = 20)
# Calculate isolation score at grid locations
# wine_grid$score <- predict(wine_forest, newdata=wine_grid)
# Contour plot of isolation scores
# contourplot(score ~ alcohol + pH, data=wine_grid, region = TRUE)
Chapter 4 - Comparing Performance
Labeled Anomalies:
Measuring Performance:
Working with Categorical Features:
Wrap Up:
Example code includes:
# --- Load the labeled thyroid dataset (label = 1 marks disease cases) for
# --- comparing anomaly-score performance against ground truth.
thyroidOrig <- readr::read_csv("./RInputFiles/thyroid.csv")
## Parsed with column specification:
## cols(
## label = col_double(),
## TSH = col_double(),
## T3 = col_double(),
## TT4 = col_double(),
## T4U = col_double(),
## FTI = col_double(),
## TBG = col_double()
## )
# Inspect the structure (1000 rows, label plus six hormone measurements).
str(thyroidOrig, give.attr=FALSE)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 1000 obs. of 7 variables:
## $ label: num 0 0 0 0 0 0 0 0 0 0 ...
## $ TSH : num -0.256 -1.397 -0.704 -0.389 -1.442 ...
## $ T3 : num -6.78 -7.66 -5.63 -6.38 -7.66 ...
## $ TT4 : num -1.98 -1.27 -1.5 -1.85 -1.42 ...
## $ T4U : num -1.29 -1.11 -1.45 -1.74 -1.14 ...
## $ FTI : num -1.218 -0.625 -0.643 -1.099 -1.099 ...
## $ TBG : num -1.44 -1.75 -2.08 -1.99 -1.4 ...
thyroid <- thyroidOrig
# View contents of thyroid data
head(thyroid)
## # A tibble: 6 x 7
## label TSH T3 TT4 T4U FTI TBG
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 -0.256 -6.78 -1.98 -1.29 -1.22 -1.44
## 2 0 -1.40 -7.66 -1.27 -1.11 -0.625 -1.75
## 3 0 -0.704 -5.63 -1.50 -1.45 -0.643 -2.08
## 4 0 -0.389 -6.38 -1.85 -1.74 -1.10 -1.99
## 5 0 -1.44 -7.66 -1.42 -1.14 -1.10 -1.40
## 6 0 -0.313 -7.66 -1.92 -1.63 -1.43 -1.62
# Tabulate the labels
# (Highly imbalanced: 22 disease cases out of 1000.)
table(thyroid$label)
##
## 0 1
## 978 22
# Proportion of thyroid cases
# mean() of a 0/1 vector gives the proportion of 1s (= 0.022).
prop_disease <- mean(thyroid$label)
# Plot of TSH and T3
plot(TSH ~ T3, data=thyroid, pch=20)
# Plot of TSH, T3 and labels
# col = label + 1 colors cases (label 1) differently from controls (label 0).
plot(TSH ~ T3, data = thyroid, pch = 20, col = label + 1)
# Plot of TT4, TBG and labels
plot(TT4 ~ TBG, data = thyroid, pch = 20, col = label + 1)
# Package isofor not available on CRAN
# Fit isolation forest
# thyroid_forest <- isofor::iForest(thyroid[, -1], nt = 200)
# Anomaly score
# thyroid$iso_score <- predict(thyroid_forest, thyroid[, -1])
# Boxplot of the anomaly score against labels
# boxplot(iso_score ~ label, data=thyroid, col = "olivedrab4")
# Create a LOF score for thyroid
# Drop the label column (column 1) so the score uses only the features.
lof_score <- dbscan::lof(scale(thyroid[, -1]), k = 10)
# Calculate high threshold for lof_score
# The 98th percentile flags the top 2% of points, roughly matching the 2.2%
# disease prevalence seen above.
high_lof <- quantile(lof_score, probs = 0.98)
# Append binary LOF score to thyroid data
thyroid$binary_lof <- as.numeric(lof_score >= high_lof)
# Isolation (iForest) anomaly scores for the thyroid observations, entered
# inline in the course notes. Values are 1000x the [0, 1] scores and are
# rescaled below — presumably exported from an iForest fit; TODO confirm source.
iso_score <- c(394, 442, 408, 369, 431, 420, 398, 374, 384, 452, 478, 461, 356, 357, 405, 437, 357, 366, 488, 671, 395, 367, 346, 387, 354, 386, 411, 548, 423, 344, 355, 459, 413, 389, 373, 360, 520, 382, 690, 676, 388, 486, 530, 561, 423, 409, 352, 441, 395, 416, 367, 377, 426, 418, 378, 357, 422, 431, 526, 380, 450, 434, 462, 360, 529, 382, 390, 371, 385, 382, 367, 416, 384, 400, 377, 391, 380, 403, 361, 355, 418, 498, 649, 465, 413, 377, 383, 375, 422, 360, 353, 380, 569, 430, 377, 418, 374, 413, 369, 378, 456, 357, 559, 375, 370, 543, 410, 548, 380, 382, 362, 390, 460, 438, 392, 742, 665, 400, 393, 382, 382, 511, 375, 363, 422, 399, 358, 448, 399, 450, 392, 369, 435, 437, 375, 529, 370, 369, 429, 532, 485, 439, 429, 363, 366, 480, 408, 622, 358, 489, 520, 393, 388, 431, 378, 400, 400, 354, 405, 388, 416, 442, 382, 348, 347, 375, 366, 397, 467, 518, 387, 376, 353, 369, 442, 380, 391, 390, 358, 401, 409, 414, 452, 377, 362, 360, 380, 381, 412, 412, 418, 381, 432, 391, 448, 395, 418, 509, 525, 398, 432, 359, 499, 444, 383, 405, 467, 418, 721, 399, 421, 527, 481, 371, 364, 459, 398, 373, 388, 434, 428, 439, 381, 405, 352, 363, 352, 403, 362, 396, 367, 365, 432, 392, 396, 367, 404, 384, 381, 364, 366, 376, 369, 379, 379, 426, 401, 380, 404, 394, 368, 361, 393, 455, 396, 540, 368, 360, 466, 365, 377, 411, 442, 408, 373, 394, 344, 352, 345, 344, 346, 344, 378, 366, 401, 436, 366, 367, 382, 356, 362, 402, 405, 376, 368, 381, 371, 391, 359, 707, 367, 370, 387, 385, 373, 354, 354, 362, 358, 364, 353, 365, 374, 385, 395, 362, 461, 374, 362, 456, 405, 426, 385, 387, 387, 375, 503, 378, 370, 358, 377, 461, 357, 353, 346, 350, 393, 456, 425, 418, 371, 380, 477, 383, 382, 349, 360, 412, 395, 409, 441, 371, 420, 455, 358, 654, 365, 507, 508, 443, 364, 381, 468, 368, 362, 454, 381, 357, 432, 374, 379, 383, 389, 367, 393, 424, 378, 361, 512, 449, 522, 352, 354, 367, 359, 396, 486, 367, 409, 427, 351, 381, 357, 362, 369, 364, 356, 730, 353, 399, 383, 523, 429, 425, 420, 455, 414, 
475, 433, 528, 425, 476, 352, 350, 413, 443, 435, 381, 472, 486, 376, 402, 361, 377, 391, 380, 355, 400, 394, 353, 379, 376, 469, 398, 464, 388, 378, 397, 396, 521, 417, 365, 420, 377, 350, 407, 364, 368, 426, 344, 351, 411, 412, 502, 381, 495, 350, 350, 344, 362, 389, 388, 370, 354, 394, 363, 564, 549, 387, 378, 411, 421, 427, 382, 385, 496, 372, 416, 365, 375, 406, 355, 362, 442, 410, 477, 361, 379, 386, 375, 351, 351, 360, 360, 412, 400, 409, 458, 351, 376, 400, 360, 499, 362, 476, 396, 407, 437, 358, 385, 373, 432, 353, 352, 369, 405, 376, 383, 462, 375, 361, 395, 426, 431, 418, 500, 585, 616, 372, 529, 418, 456, 360, 429, 397, 366, 384, 359, 515, 401, 389, 429, 371, 357, 398, 380, 371, 354, 403, 355, 356, 368, 363, 481, 545, 367, 350, 345, 344, 344, 426, 438, 464, 365, 460, 462, 419, 358, 428, 433, 352, 384, 416, 387)
# Continuation: append the remaining inline scores to the same vector.
iso_score <- c(iso_score, 384, 366, 367, 487, 628, 638, 472, 349, 351, 351, 420, 422, 347, 347, 422, 396, 443, 419, 509, 507, 398, 375, 427, 492, 388, 387, 354, 390, 439, 358, 392, 379, 361, 392, 375, 407, 663, 442, 390, 437, 432, 420, 397, 413, 477, 494, 522, 354, 354, 357, 381, 384, 412, 406, 411, 448, 508, 411, 352, 345, 511, 386, 364, 396, 476, 389, 355, 464, 363, 380, 366, 423, 396, 407, 415, 504, 440, 406, 449, 394, 432, 397, 428, 434, 401, 363, 395, 404, 392, 454, 357, 380, 352, 382, 389, 389, 345, 348, 461, 390, 371, 345, 442, 402, 386, 375, 382, 382, 404, 373, 586, 426, 600, 368, 382, 358, 407, 379, 402, 367, 366, 385, 706, 352, 384, 363, 486, 366, 433, 373, 397, 434, 402, 378, 376, 376, 359, 380, 363, 351, 543, 435, 385, 503, 359, 353, 365, 405, 457, 345, 463, 445, 363, 353, 369, 370, 355, 681, 439, 360, 417, 383, 376, 416, 428, 386, 426, 420, 462, 370, 367, 398, 373, 354, 418, 364, 357, 420, 628, 442, 403, 478, 370, 367, 399, 413, 453, 423, 376, 385, 415, 447, 349, 551, 390, 438, 384, 401, 458, 526, 449, 480, 405, 388, 391, 361, 362, 387, 429, 391, 413, 391, 380, 511, 411, 376, 374, 436, 362, 434, 437, 517, 397, 406, 372, 345, 345, 345, 347, 423, 639, 373, 397, 358, 369, 399, 464, 453, 406, 358, 350, 395, 386, 454, 396, 373, 394, 444, 377, 376, 459, 393, 353, 349, 685, 382, 419, 394, 446, 346, 442, 426, 390, 422, 568, 365, 443, 353, 513, 364, 349, 373, 422, 389, 509, 411, 443, 375, 438, 556, 349, 445, 446, 413, 455, 419, 385, 358, 381, 375, 372, 497, 589, 386, 493, 539, 769, 351, 511, 456, 373, 378, 411, 523, 448, 400, 368, 428, 381, 444, 378, 402, 377, 411, 367, 446, 374, 435, 429, 409, 399, 349, 360, 468, 400, 366, 372, 446, 384, 524, 348, 384, 371, 381, 347, 357, 369, 359, 405, 390, 363, 419, 469, 410, 413, 365, 377, 482, 398, 347, 467, 446, 442, 399, 367, 502, 424, 452, 364, 372, 355, 386, 399, 399, 370, 409, 412, 409, 396, 380, 446, 470, 375, 386, 454, 350, 514, 396, 411, 402, 360, 458, 439, 349, 345, 398, 368, 378, 355, 528, 384, 397, 543, 410, 
370, 389, 506, 412, 454, 442, 602, 383, 367, 377, 489, 371, 471, 361, 366, 355, 508, 368, 390, 368, 375, 406, 512, 374, 380, 378, 344, 381, 400, 544, 375, 527, 390, 398, 455, 393, 427, 435, 512, 379, 367, 380)
# Undo the 1000x inline scaling so scores lie in [0, 1].
iso_score <- iso_score / 1000
# Convert the continuous isolation score into a binary anomaly flag using the
# 98th percentile as the cut-off, then compare it (and the earlier LOF flag)
# against the true thyroid labels.
# Calculate high threshold for iso_score
high_iso <- quantile(iso_score, probs=0.98)
# Append binary isolation score to thyroid data (1 = flagged anomalous)
thyroid$binary_iso <- as.numeric(iso_score >= high_iso)
# Tabulate agreement of label and binary isolation score
# (rows = true label, columns = binary flag)
table(thyroid$label, thyroid$binary_iso)
##
## 0 1
## 0 970 8
## 1 10 12
# Tabulate agreement of label and binary LOF score
# (note: from the table below, LOF flags 20 points but none of them are
# true anomalies — the bottom-right cell is 0)
table(thyroid$label, thyroid$binary_lof)
##
## 0 1
## 0 958 20
## 1 22 0
# Proportion of binary_iso and label that agree: (970 + 12) / 1000 = 0.982
iso_prop <- mean(thyroid$label == thyroid$binary_iso)
# Proportion of binary_lof and label that agree: (958 + 0) / 1000 = 0.958
lof_prop <- mean(thyroid$label == thyroid$binary_lof)
table(thyroid$label, thyroid$binary_iso)
##
## 0 1
## 0 970 8
## 1 10 12
table(thyroid$label, thyroid$binary_lof)
##
## 0 1
## 0 958 20
## 1 22 0
# Precision for binary scores: true positives / all flagged points.
# From the tables above: iso = 12/20 = 0.6; lof = 0/20 = 0.
precision_iso <- sum(thyroid$label == 1 & thyroid$binary_iso == 1) / sum(thyroid$binary_iso == 1)
precision_lof <- sum(thyroid$label == 1 & thyroid$binary_lof == 1) / sum(thyroid$binary_lof == 1)
# Recall for binary scores: true positives / all true anomalies.
# From the tables above: iso = 12/22; lof = 0/22 = 0.
recall_iso <- sum(thyroid$label == 1 & thyroid$binary_iso == 1) / sum(thyroid$label == 1)
recall_lof <- sum(thyroid$label == 1 & thyroid$binary_lof == 1) / sum(thyroid$label == 1)
# Categorical age band for each thyroid observation, entered inline in the
# course notes (same order as the rows of `thyroid` — attached below).
age <- c('35-60', '0-35', '35-60', '60+', '0-35', '0-35', '0-35', '0-35', '35-60', '35-60', '35-60', '35-60', '0-35', '35-60', '35-60', '35-60', '0-35', '0-35', '35-60', '60+', '60+', '35-60', '60+', '35-60', '35-60', '60+', '60+', '0-35', '60+', '35-60', '35-60', '0-35', '0-35', '60+', '35-60', '60+', '60+', '60+', '60+', '35-60', '0-35', '0-35', '0-35', '60+', '0-35', '0-35', '0-35', '35-60', '0-35', '60+', '35-60', '60+', '0-35', '35-60', '0-35', '35-60', '35-60', '35-60', '35-60', '35-60', '0-35', '0-35', '35-60', '35-60', '60+', '60+', '35-60', '35-60', '0-35', '0-35', '0-35', '35-60', '35-60', '35-60', '0-35', '35-60', '35-60', '60+', '0-35', '0-35', '60+', '35-60', '0-35', '35-60', '0-35', '0-35', '60+', '0-35', '0-35', '0-35', '35-60', '60+', '35-60', '35-60', '35-60', '60+', '0-35', '35-60', '60+', '0-35', '35-60', '0-35', '35-60', '35-60', '60+', '60+', '60+', '35-60', '60+', '35-60', '60+', '60+', '0-35', '35-60', '60+', '60+', '0-35', '60+', '35-60', '60+', '0-35', '35-60', '0-35', '35-60', '0-35', '0-35', '35-60', '35-60', '60+', '60+', '60+', '0-35', '60+', '0-35', '0-35', '0-35', '60+', '60+', '0-35', '35-60', '35-60', '0-35', '0-35', '60+', '0-35', '60+', '35-60', '35-60', '0-35', '60+', '60+', '0-35', '0-35', '0-35', '35-60', '0-35', '0-35', '0-35', '60+', '0-35', '60+', '35-60', '35-60', '35-60', '60+', '0-35', '60+', '60+', '60+', '35-60', '35-60', '60+', '60+', '60+', '60+', '35-60', '0-35', '0-35', '35-60', '35-60', '35-60', '0-35', '35-60', '35-60', '35-60', '35-60', '60+', '60+', '60+', '0-35', '0-35', '0-35', '0-35', '35-60', '60+', '35-60', '35-60', '0-35', '60+', '60+', '0-35', '35-60', '35-60', '60+', '0-35', '60+', '60+', '60+', '0-35', '60+', '60+', '60+', '60+', '35-60', '0-35', '60+', '60+', '35-60', '60+', '0-35', '0-35', '60+', '60+', '60+', '0-35', '0-35', '35-60', '60+', '60+', '35-60', '35-60', '35-60', '60+', '0-35', '60+', '0-35', '60+', '35-60', '60+', '60+', '0-35', '35-60', '35-60', '0-35', '35-60', '60+', '0-35', '60+')
# Continuation: append the remaining inline age bands.
age <- c(age, '60+', '60+', '0-35', '60+', '35-60', '0-35', '0-35', '60+', '35-60', '35-60', '0-35', '60+', '60+', '0-35', '60+', '35-60', '0-35', '35-60', '0-35', '35-60', '0-35', '60+', '0-35', '60+', '60+', '0-35', '60+', '60+', '60+', '60+', '60+', '0-35', '0-35', '60+', '0-35', '60+', '0-35', '60+', '60+', '35-60', '35-60', '60+', '60+', '60+', '60+', '60+', '35-60', '35-60', '60+', '0-35', '35-60', '0-35', '35-60', '0-35', '35-60', '35-60', '60+', '60+', '0-35', '35-60', '0-35', '60+', '0-35', '35-60', '60+', '35-60', '60+', '0-35', '0-35', '0-35', '0-35', '35-60', '60+', '35-60', '0-35', '60+', '60+', '0-35', '0-35', '35-60', '0-35', '60+', '0-35', '35-60', '60+', '0-35', '0-35', '60+', '35-60', '60+', '0-35', '60+', '0-35', '60+', '35-60', '0-35', '35-60', '0-35', '60+', '60+', '0-35', '60+', '60+', '60+', '35-60', '35-60', '35-60', '35-60', '60+', '60+', '60+', '60+', '35-60', '0-35', '60+', '0-35', '0-35', '35-60', '35-60', '35-60', '60+', '60+', '60+', '35-60', '35-60', '0-35', '35-60', '35-60', '60+', '0-35', '35-60', '0-35', '35-60', '35-60', '60+', '60+', '0-35', '35-60', '0-35', '35-60', '0-35', '0-35', '35-60', '60+', '0-35', '60+', '0-35', '60+', '0-35', '35-60', '0-35', '60+', '60+', '60+', '35-60', '60+', '60+', '35-60', '60+', '0-35', '0-35', '0-35', '60+', '60+', '60+', '35-60', '0-35', '0-35', '35-60', '0-35', '0-35', '35-60', '35-60', '35-60', '60+', '60+', '0-35', '60+', '0-35', '60+', '35-60', '35-60', '0-35', '0-35', '60+', '0-35', '35-60', '0-35', '0-35', '0-35', '0-35', '60+', '35-60', '60+', '35-60', '0-35', '60+', '35-60', '35-60', '60+', '35-60', '0-35', '60+', '60+', '35-60', '0-35', '60+', '35-60', '60+', '0-35', '60+', '0-35', '35-60', '0-35', '0-35', '35-60', '35-60', '0-35', '35-60', '60+', '35-60', '35-60', '60+', '60+', '0-35', '35-60', '0-35', '60+', '0-35', '35-60', '0-35', '0-35', '60+', '0-35', '60+', '0-35', '60+', '60+', '35-60', '35-60', '60+', '0-35', '35-60', '0-35', '35-60', '0-35', '35-60', '35-60', '35-60', '0-35', 
'60+', '35-60', '60+', '60+', '60+', '0-35', '0-35', '35-60', '0-35', '0-35', '0-35', '60+', '60+', '60+', '0-35', '0-35', '0-35', '60+', '0-35', '35-60', '0-35', '35-60', '35-60', '60+', '60+', '60+', '0-35', '35-60', '35-60', '35-60', '60+', '60+', '60+', '60+', '35-60', '60+', '0-35', '60+', '35-60', '0-35', '35-60', '60+', '35-60', '0-35', '35-60', '35-60', '0-35', '0-35', '35-60', '60+', '60+', '35-60', '60+', '35-60', '0-35', '0-35', '0-35', '0-35', '60+', '60+', '60+', '60+', '35-60', '35-60', '35-60', '0-35', '0-35', '35-60', '60+', '0-35', '35-60', '60+', '0-35', '35-60', '35-60', '0-35', '35-60', '60+', '35-60', '35-60', '60+', '60+', '35-60', '0-35', '60+', '35-60', '0-35', '35-60', '35-60', '60+', '35-60', '60+', '60+', '60+', '60+', '0-35', '0-35', '60+', '35-60', '0-35', '60+', '35-60', '60+', '60+', '60+', '35-60', '0-35', '0-35', '60+', '0-35', '35-60', '35-60', '35-60', '60+', '0-35', '35-60', '60+', '35-60', '0-35', '60+', '0-35', '35-60')
# Continuation: append the final batch of inline age bands.
age <- c(age, '60+', '35-60', '60+', '0-35', '60+', '35-60', '35-60', '0-35', '35-60', '60+', '60+', '60+', '60+', '60+', '0-35', '60+', '60+', '35-60', '60+', '60+', '60+', '0-35', '60+', '35-60', '60+', '35-60', '35-60', '0-35', '35-60', '0-35', '35-60', '0-35', '35-60', '60+', '0-35', '0-35', '35-60', '0-35', '35-60', '35-60', '60+', '60+', '60+', '60+', '0-35', '0-35', '0-35', '35-60', '35-60', '60+', '35-60', '35-60', '35-60', '0-35', '35-60', '0-35', '35-60', '35-60', '60+', '35-60', '60+', '35-60', '35-60', '0-35', '0-35', '35-60', '60+', '0-35', '0-35', '60+', '60+', '35-60', '0-35', '60+', '35-60', '60+', '60+', '0-35', '0-35', '35-60', '0-35', '35-60', '60+', '0-35', '0-35', '0-35', '60+', '35-60', '60+', '60+', '35-60', '35-60', '60+', '0-35', '60+', '60+', '60+', '35-60', '60+', '60+', '60+', '0-35', '60+', '60+', '0-35', '0-35', '0-35', '60+', '0-35', '0-35', '0-35', '60+', '60+', '60+', '0-35', '60+', '60+', '0-35', '35-60', '35-60', '35-60', '0-35', '35-60', '35-60', '35-60', '0-35', '35-60', '35-60', '35-60', '35-60', '60+', '60+', '35-60', '35-60', '0-35', '60+', '0-35', '35-60', '35-60', '35-60', '0-35', '35-60', '35-60', '60+', '60+', '60+', '0-35', '60+', '0-35', '0-35', '0-35', '0-35', '0-35', '0-35', '0-35', '0-35', '60+', '60+', '35-60', '0-35', '0-35', '0-35', '35-60', '35-60', '35-60', '60+', '0-35', '0-35', '35-60', '35-60', '35-60', '60+', '60+', '60+', '60+', '35-60', '0-35', '0-35', '60+', '60+', '35-60', '35-60', '60+', '0-35', '60+', '35-60', '35-60', '0-35', '0-35', '0-35', '0-35', '35-60', '0-35', '60+', '0-35', '35-60', '60+', '60+', '0-35', '35-60', '35-60', '60+', '35-60', '0-35', '0-35', '0-35', '60+', '0-35', '60+', '0-35', '0-35', '60+', '0-35', '35-60', '35-60', '35-60', '35-60', '0-35', '0-35', '35-60', '60+', '0-35', '60+', '0-35', '0-35', '60+', '60+', '0-35', '60+', '35-60', '35-60', '0-35', '0-35', '35-60', '60+', '0-35', '60+', '60+', '60+', '35-60', '35-60', '0-35', '60+', '60+', '35-60', '60+', '60+', '60+', '35-60', 
'0-35', '35-60', '60+', '60+', '35-60', '0-35', '60+', '0-35', '35-60', '35-60', '60+', '60+', '35-60', '0-35', '60+', '60+', '35-60', '35-60', '0-35', '35-60', '35-60', '0-35', '0-35', '60+', '0-35', '60+', '35-60', '35-60', '60+', '0-35', '60+', '60+', '35-60', '0-35', '0-35', '0-35', '60+', '0-35', '35-60', '35-60', '0-35', '0-35', '35-60', '35-60', '60+', '35-60', '60+', '60+', '0-35', '35-60', '0-35', '0-35', '35-60', '60+', '0-35', '35-60', '35-60', '60+', '60+', '35-60', '0-35', '0-35', '60+', '60+', '0-35', '0-35', '60+', '60+', '60+', '35-60', '0-35', '0-35', '0-35', '0-35', '60+', '60+', '0-35', '35-60', '60+', '0-35', '35-60', '60+', '60+', '35-60', '60+', '60+', '35-60', '0-35', '60+', '60+', '0-35', '35-60', '35-60', '35-60', '60+', '60+', '0-35', '60+', '35-60', '0-35', '0-35', '0-35', '35-60', '35-60', '60+', '0-35', '35-60', '0-35', '35-60', '35-60', '35-60', '60+', '0-35', '35-60', '60+', '60+', '60+', '35-60', '0-35', '0-35', '0-35', '0-35', '0-35', '60+', '60+', '35-60', '60+', '0-35', '35-60', '0-35', '60+')
# Sex ('M'/'F') for each thyroid observation, entered inline in the course
# notes (same order as the rows of `thyroid` — attached below).
sex <- c('F', 'F', 'M', 'F', 'F', 'M', 'M', 'M', 'F', 'F', 'F', 'F', 'F', 'M', 'F', 'F', 'F', 'F', 'F', 'F', 'M', 'F', 'F', 'M', 'F', 'M', 'M', 'F', 'F', 'F', 'F', 'M', 'F', 'M', 'F', 'M', 'M', 'M', 'F', 'F', 'M', 'F', 'M', 'F', 'M', 'M', 'F', 'M', 'M', 'F', 'F', 'M', 'F', 'M', 'M', 'M', 'M', 'F', 'F', 'M', 'F', 'F', 'M', 'F', 'M', 'F', 'M', 'M', 'F', 'M', 'F', 'M', 'F', 'F', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'F', 'M', 'M', 'F', 'F', 'F', 'F', 'M', 'F', 'F', 'F', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'F', 'M', 'F', 'F', 'M', 'F', 'M', 'M', 'F', 'F', 'M', 'M', 'F', 'F', 'F', 'M', 'M', 'M', 'M', 'F', 'M', 'F', 'F', 'F', 'F', 'M', 'F', 'F', 'M', 'F', 'M', 'M', 'M', 'F', 'M', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'M', 'F', 'F', 'M', 'F', 'M', 'F', 'F', 'M', 'M', 'M', 'M', 'M', 'F', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'F', 'M', 'M', 'F', 'M', 'M', 'F', 'M', 'F', 'F', 'F', 'M', 'M', 'M', 'M', 'M', 'M', 'F', 'M', 'F', 'F', 'F', 'F', 'M', 'M', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'M', 'M', 'M', 'F', 'M', 'F', 'F', 'M', 'F', 'F', 'M', 'M', 'M', 'M', 'M', 'M', 'F', 'F', 'M', 'M', 'M', 'M', 'F', 'F', 'F', 'M', 'M', 'M', 'M', 'M', 'M', 'F', 'F', 'M', 'M', 'F', 'F', 'M', 'M', 'M', 'M', 'M', 'F', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'F', 'F', 'M', 'M', 'M', 'F', 'M', 'M', 'F', 'M', 'M', 'F', 'F', 'F', 'F', 'M', 'M', 'F', 'F', 'M', 'M', 'M', 'M', 'F', 'M', 'F', 'M', 'M', 'F', 'F', 'F', 'M', 'F', 'M', 'M', 'M', 'M', 'F', 'F', 'M', 'M', 'F', 'M', 'F', 'F', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'F', 'F', 'M', 'M', 'F', 'F', 'F', 'M', 'M', 'F', 'M', 'M', 'M', 'M', 'F', 'M', 'F', 'M', 'M', 'M', 'F', 'F', 'M', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'M', 'M', 'F', 'F', 'M', 'F', 'F', 'F', 'F', 'M', 'M', 'M', 'F', 'M', 'F', 'M', 'M', 'F', 'M', 'M', 'F', 'M', 'F', 'M', 'F', 'M', 'F', 'F', 'F', 'M', 'M', 'F', 'F', 'F', 'F', 'F', 'M', 'M', 'M', 'F', 'F', 'M', 'M', 'F', 'F', 'F', 'M', 'F', 'F', 'F', 'M', 'M', 'M', 'M', 'M', 'F', 'F', 'M', 'M', 'M', 'M', 'F', 'F', 'F', 'F', 'M', 
'F', 'M', 'M', 'M', 'M', 'F', 'F', 'M', 'F', 'M', 'M', 'M', 'M', 'F', 'F', 'M', 'M', 'F', 'M', 'M', 'M', 'M', 'M', 'F', 'F', 'F', 'F', 'F', 'M', 'M', 'F', 'F', 'M', 'M', 'M', 'M', 'M', 'F', 'F', 'F', 'F', 'F')
# Continuation: append the remaining inline sex values.
sex <- c(sex, 'M', 'F', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'F', 'M', 'M', 'M', 'F', 'M', 'F', 'M', 'F', 'M', 'M', 'M', 'M', 'M', 'F', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'F', 'M', 'F', 'F', 'F', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'F', 'F', 'F', 'F', 'F', 'F', 'M', 'F', 'F', 'F', 'M', 'F', 'M', 'F', 'F', 'M', 'F', 'F', 'M', 'F', 'M', 'M', 'F', 'M', 'M', 'M', 'M', 'F', 'F', 'F', 'M', 'M', 'F', 'F', 'F', 'F', 'F', 'F', 'M', 'F', 'M', 'M', 'M', 'F', 'M', 'M', 'M', 'M', 'M', 'F', 'M', 'F', 'M', 'M', 'M', 'F', 'F', 'M', 'F', 'M', 'F', 'M', 'M', 'M', 'F', 'F', 'M', 'F', 'F', 'F', 'M', 'F', 'M', 'F', 'F', 'M', 'F', 'F', 'F', 'M', 'F', 'F', 'F', 'M', 'F', 'F', 'M', 'M', 'F', 'M', 'M', 'F', 'M', 'F', 'M', 'F', 'M', 'M', 'F', 'F', 'M', 'M', 'F', 'M', 'F', 'M', 'F', 'F', 'M', 'M', 'M', 'F', 'M', 'F', 'M', 'M', 'M', 'F', 'M', 'M', 'M', 'M', 'M', 'F', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'F', 'F', 'F', 'F', 'F', 'F', 'M', 'F', 'F', 'F', 'F', 'F', 'M', 'M', 'F', 'F', 'F', 'M', 'M', 'F', 'M', 'M', 'M', 'F', 'F', 'F', 'F', 'M', 'M', 'M', 'M', 'M', 'F', 'M', 'M', 'M', 'M', 'M', 'M', 'F', 'F', 'F', 'F', 'M', 'M', 'F', 'F', 'F', 'F', 'M', 'M', 'F', 'F', 'M', 'M', 'F', 'F', 'F', 'F', 'M', 'M', 'F', 'F', 'F', 'M', 'F', 'F', 'F', 'F', 'M', 'M', 'M', 'F', 'F', 'F', 'F', 'M', 'M', 'F', 'F', 'M', 'F', 'F', 'M', 'M', 'M', 'M', 'M', 'F', 'M', 'M', 'F', 'F', 'F', 'M', 'M', 'M', 'F', 'F', 'M', 'F', 'M', 'F', 'F', 'M', 'F', 'F', 'M', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'M', 'F', 'F', 'M', 'F', 'M', 'M', 'F', 'M', 'F', 'F', 'M', 'F', 'F', 'F', 'M', 'F', 'F', 'F', 'M', 'M', 'M', 'F', 'M', 'F', 'M', 'M', 'M', 'F', 'F', 'F', 'F', 'F', 'F', 'M', 'F', 'M', 'M', 'F', 'M', 'M', 'M', 'F', 'F', 'M', 'F', 'F', 'F', 'F', 'M', 'M', 'M', 'F', 'M', 'M', 'F', 'M', 'M', 'F', 'F', 'F', 'M', 'F', 'M', 'F', 'M', 'M', 'F', 'M', 'F', 'F', 'F', 'M', 'F', 'F', 'F', 'F', 'F', 'F', 'M', 'F', 'M', 'M', 'F', 'M', 'F', 'M', 'F', 'M', 'F', 'F', 'M', 'M', 'M', 'F', 'F', 'M', 'M', 'F', 'F', 'M', 'F', 'M', 'M', 'F', 'M', 
'F', 'M', 'M', 'F', 'M', 'F', 'F', 'F', 'F', 'M', 'M', 'F', 'M', 'M', 'F', 'F', 'F', 'F', 'M', 'F', 'M', 'F', 'M', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'M', 'M', 'F', 'F', 'M', 'F', 'F', 'F', 'M', 'F', 'M', 'M', 'F', 'M', 'M', 'F', 'M', 'M', 'F', 'M', 'M', 'F', 'M', 'F', 'M', 'F', 'F', 'M', 'M', 'F', 'F', 'F', 'M', 'F', 'F', 'F', 'F', 'M', 'M', 'F', 'F', 'F', 'F', 'F', 'M', 'F', 'M', 'M', 'F', 'M', 'M', 'M', 'M', 'F', 'M', 'M', 'M', 'F', 'F', 'M', 'M', 'F', 'F', 'M', 'M', 'M', 'M', 'F', 'M', 'F', 'M', 'F', 'M', 'F', 'F', 'F', 'F', 'M', 'F', 'F', 'F', 'M', 'F', 'M', 'M', 'M', 'F', 'M', 'M', 'M', 'M', 'M', 'M', 'F', 'F', 'M', 'M', 'F', 'M', 'F', 'F', 'M', 'M', 'M', 'F', 'F', 'M', 'M', 'F', 'F', 'M', 'F', 'F', 'F', 'F', 'F', 'F', 'M', 'M', 'M', 'F', 'M', 'F', 'F', 'M', 'F', 'F', 'F', 'F', 'M', 'M', 'M', 'F')
# Attach the demographic columns to the thyroid data frame.
thyroid$age <- age
thyroid$sex <- sex
# Print the column classes in thyroid
sapply(X = thyroid, FUN = class)
## label TSH T3 TT4 T4U FTI
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
## TBG binary_lof binary_iso age sex
## "numeric" "numeric" "numeric" "character" "character"
# Convert column with character class to factor (Gower distance needs
# categorical columns to be factors, not character vectors)
thyroid$age <- as.factor(thyroid$age)
thyroid$sex <- as.factor(thyroid$sex)
# Check that all columns are factor or numeric
sapply(X = thyroid, FUN = class)
## label TSH T3 TT4 T4U FTI TBG
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
## binary_lof binary_iso age sex
## "numeric" "numeric" "factor" "factor"
# Check the class of age column
class(thyroid$age)
## [1] "factor"
# Check the class of sex column
class(thyroid$sex)
## [1] "factor"
# Fit an isolation forest with 100 trees
# thyroid_for <- iForest(thyroid[, -1], nt=100)
# Calculate Gower's distance matrix over all columns except the label.
# The warning below is triggered because binary_lof/binary_iso are stored as
# numeric 0/1 rather than factors, so daisy treats them as interval scaled.
thyroid_dist <- cluster::daisy(thyroid[, -1], metric = "gower")
## Warning in cluster::daisy(thyroid[, -1], metric = "gower"): binary variable(s)
## 7, 8 treated as interval scaled
# Generate LOF scores for thyroid data from the precomputed distance matrix,
# using the 10 nearest neighbours.
thyroid_lof <- dbscan::lof(thyroid_dist, k = 10)
# Range of values in the distance matrix (Gower distances lie in [0, 1])
range(as.matrix(thyroid_dist))
## [1] 0.0000000 0.6718958
Chapter 1 - Standard GARCH Model as the Workhorse
Analyzing volatility:
GARCH equation for volatility prediction:
predvar[t] <- omega + alpha * e2[t - 1] + beta * predvar[t-1] # GARCH(1,1) equation
rugarch package:
Example code includes:
# Load, return-transform and explore the S&P 500 price series, then plot
# rolling estimates of annualized volatility.
library(xts)
library(PerformanceAnalytics)
load("./RInputFiles/sp500prices.RData")
str(sp500prices)
# Plot daily S&P 500 prices
plot(sp500prices)
# Compute daily returns
sp500ret <- CalculateReturns(sp500prices)
# Check the class of sp500ret
class(sp500ret)
# Plot daily returns
plot(sp500ret)
# Compute the daily standard deviation for the complete sample
# NOTE(review): CalculateReturns typically leaves an NA in the first
# observation; sd() over a series containing that NA returns NA. The first
# row is only dropped further below — TODO confirm intended order.
sd(sp500ret)
# Compute the annualized volatility for the complete sample
# (annualization uses sqrt(252) for ~252 trading days per year)
sd(sp500ret) * sqrt(252)
# Compute the annualized standard deviation for the year 2009
sqrt(252) * sd(sp500ret["2009"])
# Compute the annualized standard deviation for the year 2017
sqrt(252) * sd(sp500ret["2017"])
# Showing two plots on the same figure
par(mfrow=c(2,1))
# Compute the rolling 1 month estimate of annualized volatility
# (width = 22 trading days, per the plot title)
chart.RollingPerformance(R = sp500ret["2000::2017"], width = 22,
FUN = "sd.annualized", scale = 252, main = "One month rolling volatility")
# Compute the rolling 3 months estimate of annualized volatility
# (width = 66 trading days, per the plot title)
chart.RollingPerformance(R = sp500ret["2000::2017"], width = 66,
FUN = "sd.annualized", scale = 252, main = "Three months rolling volatility")
par(mfrow=c(1,1))
# Hand-rolled GARCH(1,1) variance recursion on S&P 500 returns:
#   predvar[t] = omega + alpha * e2[t-1] + beta * predvar[t-1]
# Defines m, e, e2, nobs, predvar, omega, alpha, beta and ann_predvol for
# later use; plots |e|, its ACF and the annualized predicted volatility.
#
# Drop the first row (the leading NA produced by CalculateReturns).
# Using [-1, ] instead of [2:length(sp500ret), ]: length() on an xts counts
# data points (rows x columns), not rows, so the original form is only
# correct for a single-column series.
sp500ret <- sp500ret[-1, ]
# Compute the mean daily return
m <- mean(sp500ret)
# Define the series of prediction errors (demeaned returns)
e <- sp500ret - m
# Plot the absolute value of the prediction errors
par(mfrow = c(2,1), mar = c(3, 2, 2, 2))
plot(abs(e))
# Plot the acf of the absolute prediction errors
# (persistent autocorrelation here indicates volatility clustering)
acf(abs(e))
par(mfrow = c(1,1), mar = c(5.1, 4.1, 4.1, 2.1))
nobs <- length(sp500ret)
predvar <- numeric(nobs)
# GARCH(1,1) parameters (fixed, not estimated, for this illustration)
omega <- 1.2086e-05
alpha <- 0.1
beta <- 0.8
e2 <- e^2
# Compute the predicted variances, seeding the recursion with the
# unconditional sample variance.
predvar[1] <- var(sp500ret)
for (t in 2:nobs) {
  predvar[t] <- omega + alpha * e2[t - 1] + beta * predvar[t - 1]
}
# Create annualized predicted volatility (daily vol x sqrt(252))
ann_predvol <- xts(sqrt(predvar) * sqrt(252), order.by = time(sp500ret))
# Plot the annual predicted volatility in 2008 and 2009
plot(ann_predvol["2008::2009"], main = "Ann. S&P 500 vol in 2008-2009")
# Fit a standard GARCH(1,1) with rugarch, inspect/forecast volatility, and
# derive volatility-targeting portfolio weights.
# Specify a standard GARCH model with constant mean
# (armaOrder c(0, 0) = constant mean; sGARCH = standard GARCH; normal errors)
garchspec <- rugarch::ugarchspec(mean.model = list(armaOrder = c(0, 0)),
variance.model = list(model = "sGARCH"),
distribution.model = "norm")
# Estimate the model
garchfit <- rugarch::ugarchfit(data = sp500ret, spec = garchspec)
# Use the method sigma to retrieve the estimated volatilities
garchvol <- rugarch::sigma(garchfit)
# Plot the volatility for 2017
plot(garchvol["2017"])
# Compute unconditional volatility (long-run volatility implied by the model)
sqrt(rugarch::uncvariance(garchfit))
# Print last 10 volatility estimates in garchvol
tail(garchvol, 10)
# Forecast volatility 5 days ahead
garchforecast <- rugarch::ugarchforecast(fitORspec = garchfit, n.ahead = 5)
# Extract the predicted volatilities and print them
print(rugarch::sigma(garchforecast))
# Compute the annualized volatility
annualvol <- sqrt(252) * rugarch::sigma(garchfit)
# Compute the 5% vol target weights
# (weight in the risky asset so the portfolio's annualized vol is 5%)
vt_weights <- 0.05 / annualvol
# Compare the annualized volatility to the portfolio weights in a plot
plot(merge(annualvol, vt_weights), multi.panel = TRUE)
Chapter 2 - Improvements of the Normal GARCH Model
Non-normality of standardized returns:
Leverage effect:
Mean model:
Avoid unnecessary complexity:
Example code includes:
# Fit a GARCH model with skewed Student t errors and inspect the
# standardized returns.
load("./RInputFiles/ret.RData")
str(ret)
# Plot the return series
plot(ret)
# Specify the garch model to be used
# ("sstd" = skewed Student t distribution for the innovations)
garchspec <- rugarch::ugarchspec(mean.model = list(armaOrder = c(0,0)),
variance.model = list(model = "sGARCH"),
distribution.model = "sstd"
)
# Estimate the model
garchfit <- rugarch::ugarchfit(data = ret, spec = garchspec)
# Inspect the coefficients
rugarch::coef(garchfit)
# Compute the standardized returns
stdret <- rugarch::residuals(garchfit, standardize = TRUE)
# Compute the standardized returns using fitted() and sigma()
# (equivalent to the residuals() call above; recomputed to show the identity
# stdret = (ret - fitted) / sigma)
stdret <- (ret - rugarch::fitted(garchfit)) / rugarch::sigma(garchfit)
# Load the package PerformanceAnalytics and make the histogram
# (overlays a normal curve and a kernel density on the histogram)
chart.Histogram(stdret, methods = c("add.normal","add.density" ), colorset = c("gray","red","blue"))
# Compare GJR GARCH (leverage effect), AR(1) and GARCH-in-mean mean models
# on Microsoft returns.
load("./RInputFiles/msftret.RData")
str(msftret)
# Specify the GJR GARCH model
# (gjrGARCH lets negative shocks raise volatility more than positive ones)
garchspec <- rugarch::ugarchspec(mean.model = list(armaOrder = c(0,0)),
variance.model = list(model = "gjrGARCH"),
distribution.model = "sstd"
)
# Estimate the model and compute volatility
gjrgarchfit <- rugarch::ugarchfit(data = msftret, spec = garchspec)
gjrgarchvol <- rugarch::sigma(gjrgarchfit)
# Compare volatility: absolute returns in grey, GJR GARCH vol overlaid in red
plotvol <- plot(abs(msftret), col = "grey")
plotvol <- addSeries(gjrgarchvol, col = "red", on=1)
# plotvol <- addSeries(sgarchvol, col = "blue", on=1)
plotvol
# Specify AR(1)-GJR GARCH model (armaOrder c(1, 0) adds an AR(1) mean)
garchspec <- rugarch::ugarchspec(mean.model = list(armaOrder = c(1, 0)),
variance.model = list(model = "gjrGARCH"),
distribution.model = "sstd"
)
# Estimate the model
garchfit <- rugarch::ugarchfit(data=msftret, spec=garchspec)
# Print the first two coefficients
rugarch::coef(garchfit)[c(1:2)]
# GARCH-in-Mean specification and estimation
# (archm = TRUE makes the conditional mean depend on the conditional
# variance; archpow = 2 uses the variance rather than the volatility)
gim_garchspec <- rugarch::ugarchspec(mean.model = list(armaOrder = c(0,0), archm = TRUE, archpow = 2),
variance.model = list(model = "gjrGARCH"),
distribution.model = "sstd"
)
gim_garchfit <- rugarch::ugarchfit(data = msftret , spec = gim_garchspec)
# Predicted mean returns and volatility of GARCH-in-mean
gim_mean <- rugarch::fitted(gim_garchfit)
gim_vol <- rugarch::sigma(gim_garchfit)
# AR(1) mean with GARCH-in-mean terms, standard GARCH variance
ar1_garchspec <- rugarch::ugarchspec(mean.model = list(armaOrder = c(1,0), archm = TRUE, archpow = 2),
variance.model = list(model = "sGARCH"),
distribution.model = "sstd"
)
ar1_garchfit <- rugarch::ugarchfit(data = msftret , spec = ar1_garchspec)
ar1_mean <- rugarch::fitted(ar1_garchfit)
ar1_vol <- rugarch::sigma(ar1_garchfit)
# Constant-mean variant (no AR term), standard GARCH variance
cmu_garchspec <- rugarch::ugarchspec(mean.model = list(armaOrder = c(0,0), archm = TRUE, archpow = 2),
variance.model = list(model = "sGARCH"),
distribution.model = "sstd"
)
cmu_garchfit <- rugarch::ugarchfit(data = msftret , spec = cmu_garchspec)
constmean_mean <- rugarch::fitted(cmu_garchfit)
constmean_vol <- rugarch::sigma(cmu_garchfit)
# Correlation between predicted return using AR(1) and GARCH-in-mean models
cor(ar1_mean, gim_mean)
# Correlation between predicted volatilities across mean.models
cor(merge(constmean_vol, ar1_vol, gim_vol))
# EUR/USD returns: compare an unrestricted AR(1) GARCH against restricted
# (fixed parameters), bounded (parameter bounds) and variance-targeting
# versions of the model.
load("./RInputFiles/EURUSDret.RData")
str(EURUSDret)
flexgarchspec <- rugarch::ugarchspec(mean.model = list(armaOrder = c(1,0), archm = FALSE),
variance.model = list(model = "sGARCH"),
distribution.model = "sstd"
)
flexgarchfit <- rugarch::ugarchfit(data = EURUSDret , spec = flexgarchspec)
# Print the flexible GARCH parameters
rugarch::coef(flexgarchfit)
# Restrict the flexible GARCH model by imposing a fixed ar1 and skew parameter
# (ar1 = 0 removes the AR term; skew = 1 makes the distribution symmetric)
rflexgarchspec <- flexgarchspec
rugarch::setfixed(rflexgarchspec) <- list(ar1 = 0, skew = 1)
# Estimate the restricted GARCH model
rflexgarchfit <- rugarch::ugarchfit(data = EURUSDret, spec = rflexgarchspec)
# Compare the volatility of the unrestricted and restricted GARCH models
plotvol <- plot(abs(EURUSDret), col = "grey")
plotvol <- addSeries(rugarch::sigma(flexgarchfit), col = "black", lwd = 4, on=1 )
plotvol <- addSeries(rugarch::sigma(rflexgarchfit), col = "red", on=1)
plotvol
# Define bflexgarchspec as the bound constrained version
bflexgarchspec <- flexgarchspec
rugarch::setbounds(bflexgarchspec) <- list(alpha1 = c(0.05, 0.2), beta1 = c(0.8, 0.95))
# Estimate the bound constrained model
bflexgarchfit <- rugarch::ugarchfit(data = EURUSDret, spec = bflexgarchspec)
# Inspect coefficients
rugarch::coef(bflexgarchfit)
# Compare forecasts for the next ten days (unbounded vs bounded model)
cbind(rugarch::sigma(rugarch::ugarchforecast(flexgarchfit, n.ahead = 10)),
rugarch::sigma(rugarch::ugarchforecast(bflexgarchfit, n.ahead = 10))
)
# Complete the specification to do variance targeting
# (variance.targeting = TRUE pins the model-implied unconditional variance
# to the sample variance instead of estimating omega freely)
garchspec <- rugarch::ugarchspec(mean.model = list(armaOrder = c(0,0)),
variance.model = list(model = "sGARCH", variance.targeting = TRUE),
distribution.model = "std"
)
# Estimate the model
garchfit <- rugarch::ugarchfit(data = EURUSDret, spec = garchspec)
# Print the GARCH model implied long run volatility
sqrt(rugarch::uncvariance(garchfit))
# Verify that it equals the standard deviation (after rounding)
all.equal(sqrt(rugarch::uncvariance(garchfit)), sd(EURUSDret), tol = 1e-4)
Chapter 3 - Performance Evaluation
Statistical Significance:
Goodness of Fit:
Diagnosing Absolute Standardized Returns:
Back-testing using ugarchroll:
Example code includes:
# Performance evaluation: parameter significance, MSE of variance
# predictions, likelihood/information criteria, and Ljung-Box diagnostics
# on absolute standardized returns.
# Specify model with AR(1) dynamics, GJR GARCH and skewed student t
flexgarchspec <- rugarch::ugarchspec(mean.model = list(armaOrder = c(1,0)),
variance.model = list(model = "gjrGARCH"),
distribution.model = "sstd"
)
# Estimate the model
flexgarchfit <- rugarch::ugarchfit(data = EURUSDret, spec = flexgarchspec)
# Complete and study the statistical significance of the estimated parameters
# (@fit$matcoef holds estimates, std. errors, t-values and p-values)
round(flexgarchfit@fit$matcoef, 6)
# Specify model with AR(1) mean, variance-targeting standard GARCH and
# skewed student t (the original note said "constant mean ... student t",
# which does not match the code below)
tgarchspec <- rugarch::ugarchspec(mean.model = list(armaOrder = c(1, 0)),
variance.model = list(model = "sGARCH", variance.targeting = TRUE),
distribution.model = "sstd"
)
# Fix the mu parameter at zero (no estimated intercept in the mean equation)
rugarch::setfixed(tgarchspec) <- list("mu" = 0)
# Estimate the model
tgarchfit <- rugarch::ugarchfit(data = EURUSDret, spec = tgarchspec)
# Verify that the differences in volatility are small
plot(rugarch::sigma(tgarchfit) - rugarch::sigma(flexgarchfit))
# Compute prediction errors
garcherrors <- rugarch::residuals(flexgarchfit)
gjrerrors <- rugarch::residuals(tgarchfit)
# Compute MSE for variance prediction of garchfit model
# (squared error of predicted variance vs squared prediction error)
mean((rugarch::sigma(flexgarchfit)**2 - garcherrors^2)**2)
# Compute MSE for variance prediction of gjrfit model
mean((rugarch::sigma(tgarchfit)**2 - gjrerrors^2)**2)
# Print the number of estimated parameters
length(rugarch::coef(flexgarchfit))
length(rugarch::coef(tgarchfit))
# Print likelihood of the two models
rugarch::likelihood(flexgarchfit)
rugarch::likelihood(tgarchfit)
# Print the information criteria of the two models
rugarch::infocriteria(flexgarchfit)
rugarch::infocriteria(tgarchfit)
# Compute the standardized returns
stdEURUSDret <- rugarch::residuals(tgarchfit, standardize = TRUE)
# Compute their sample mean and standard deviation
mean(stdEURUSDret)
sd(stdEURUSDret)
# Correlogram of the absolute (standardized) returns, 22 lags
par(mfrow = c(1, 2))
acf(abs(EURUSDret), 22)
acf(abs(stdEURUSDret), 22)
par(mfrow = c(1, 1))
# Ljung-Box test for remaining autocorrelation up to lag 22
Box.test(abs(stdEURUSDret), 22, type = "Ljung-Box")
# Back-testing with ugarchroll: compare in-sample volatility against
# rolling out-of-sample forecasts and score both models by MSE.
# Estimate the model on the last 2500 observations
tgarchspec <- rugarch::ugarchspec(mean.model = list(armaOrder = c(0,0)),
variance.model = list(model = "sGARCH"),
distribution.model = "std"
)
tgarchfit <- rugarch::ugarchfit(data = tail(EURUSDret, 2500) , spec = tgarchspec)
# Compute standardized returns
stdEURUSDret <- rugarch::residuals(tgarchfit, standardize = TRUE)
# Do the Ljung-Box test on the absolute standardized returns
Box.test(abs(stdEURUSDret), 22, type = "Ljung-Box")
# Estimate the GARCH model using all the returns and compute the in-sample estimates of volatility
garchinsample <- rugarch::ugarchfit(data = sp500ret, spec = flexgarchspec)
garchvolinsample <- rugarch::sigma(garchinsample)
# Use ugarchroll for rolling estimation of the GARCH model
# (first forecast after 2000 obs; moving window re-estimated every 2500 obs)
garchroll <- rugarch::ugarchroll(flexgarchspec, data = sp500ret,
n.start = 2000, refit.window = "moving", refit.every = 2500
)
# Set preds to the data frame with rolling predictions
# NOTE(review): as.data.frame is a base generic; confirm that rugarch
# actually exports it for a namespaced rugarch::as.data.frame call.
preds <- rugarch::as.data.frame(garchroll)
# Compare in-sample and rolling sample volatility in one plot
garchvolroll <- xts(preds$Sigma, order.by = as.Date(rownames(preds)))
volplot <- plot(garchvolinsample, col = "darkgrey", lwd = 1.5,
main = "In-sample versus rolling vol forecasts"
)
volplot <- addSeries(garchvolroll, col = "blue", on = 1)
plot(volplot)
# Inspect the first three rows of the dataframe with out of sample predictions
head(preds, 3)
# Compute prediction errors (e) and variance-prediction errors (d)
e <- preds$Realized - preds$Mu
d <- e^2 - preds$Sigma^2
# Compute MSE for the garchroll variance prediction
garchMSE <- mean(d^2)
# Use ugarchroll for rolling estimation of the GARCH model
gjrgarchroll <- rugarch::ugarchroll(tgarchspec, data = sp500ret,
n.start = 2000, refit.window = "moving", refit.every = 2500
)
# Compute MSE for gjrgarchroll (same error decomposition as above)
gjrgarchpreds <- rugarch::as.data.frame(gjrgarchroll)
e <- gjrgarchpreds$Realized - gjrgarchpreds$Mu
d <- e^2 - gjrgarchpreds$Sigma^2
gjrgarchMSE <- mean(d**2)
Chapter 4 - Applications
Value at Risk:
Production and Simulation:
Model Risk:
for (distribution.model in distribution.models) {
  garchspec <- ugarchspec(mean.model = list(armaOrder = c(0, 0)),
                          variance.model = list(model = variance.model),
                          distribution.model = distribution.model)
  garchfit <- ugarchfit(data = msftret, spec = garchspec)
  if (c == 1) {
    msigma <- sigma(garchfit)
  } else {
    msigma <- merge(msigma, sigma(garchfit))
  }
  c <- c + 1
}
GARCH Covariance:
Wrap Up:
Example code includes:
# Value-at-Risk from rolling GARCH forecasts and coverage comparison
# between normal and skewed Student t innovations.
flexgarchspec <- rugarch::ugarchspec(mean.model = list(armaOrder = c(1,0)),
variance.model = list(model = "gjrGARCH"),
distribution.model = "sstd"
)
garchroll <- rugarch::ugarchroll(flexgarchspec, data = msftret,
n.start = 2000, refit.window = "moving", refit.every = 2500
)
# Extract the dataframe with predictions from the rolling GARCH estimation
garchpreds <- rugarch::as.data.frame(garchroll)
# Extract the 5% VaR (5th percentile of the predicted return distribution)
garchVaR <- rugarch::quantile(garchroll, probs = 0.05)
# Extract the volatility from garchpreds
garchvol <- xts(garchpreds$Sigma, order.by = time(garchVaR))
# Analyze the comovement in a time series plot
garchplot <- plot(garchvol, ylim = c(-0.1, 0.1))
garchplot <- addSeries(garchVaR, on = 1, col = "blue")
plot(garchplot, main = "Daily vol and 5% VaR")
# Take a default specification with a normal and skewed student t distribution
normgarchspec <- rugarch::ugarchspec(distribution.model = "norm")
sstdgarchspec <- rugarch::ugarchspec(distribution.model = "sstd")
# Do rolling estimation
normgarchroll <- rugarch::ugarchroll(normgarchspec, data = msftret, n.start = 2500,
refit.window = "moving", refit.every = 2000
)
sstdgarchroll <- rugarch::ugarchroll(sstdgarchspec, data = msftret, n.start = 2500,
refit.window = "moving", refit.every = 2000
)
# Compute the 5% value at risk
normgarchVaR <- rugarch::quantile(normgarchroll, probs = 0.05)
sstdgarchVaR <- rugarch::quantile(sstdgarchroll, probs = 0.05)
# Compute the coverage: the proportion of days the realized return falls
# below the VaR forecast (should be close to 5% for a well-calibrated model)
actual <- xts(rugarch::as.data.frame(normgarchroll)$Realized, time(normgarchVaR))
mean(actual < normgarchVaR)
mean(actual < sstdgarchVaR)
# Production use of a fitted GARCH model: fit on a sub-sample, freeze the
# coefficients, then filter/forecast/simulate with the fixed specification.
garchspec <- rugarch::ugarchspec(mean.model = list(armaOrder = c(1,0)),
variance.model = list(model = "gjrGARCH"),
distribution.model = "sstd"
)
# Estimate the model on data up to December 2006
garchfit <- rugarch::ugarchfit(data = sp500ret["/2006-12"], spec = garchspec)
# Fix the parameters at the estimated values
progarchspec <- garchspec
rugarch::setfixed(progarchspec) <- as.list(rugarch::coef(garchfit))
# Use ugarchfilter to obtain the estimated volatility for the complete period
garchfilter <- rugarch::ugarchfilter(data = sp500ret, spec = progarchspec)
plot(rugarch::sigma(garchfilter))
# Compare the 252 days ahead forecasts made at the end of September 2008 and September 2017
garchforecast2008 <- rugarch::ugarchforecast(data = sp500ret["/2008-09"],
fitORspec = progarchspec, n.ahead = 252
)
garchforecast2017 <- rugarch::ugarchforecast(data = sp500ret["/2017-09"],
fitORspec = progarchspec, n.ahead = 252
)
# Stack the two forecast plots vertically; restore par() defaults afterwards
par(mfrow = c(2, 1), mar = c(3, 2, 3, 2))
plot(rugarch::sigma(garchforecast2008), main = "/2008-09", type = "l")
plot(rugarch::sigma(garchforecast2017), main = "/2017-09", type = "l")
par(mfrow = c(1, 1), mar = c(5.1, 4.1, 4.1, 2.1))
simgarchspec <- garchspec
rugarch::setfixed(simgarchspec) <- as.list(rugarch::coef(garchfit))
# Simulate 4 time series of 10 years of daily returns (fixed seed 210)
simgarch <- rugarch::ugarchpath(spec=simgarchspec, m.sim = 4, n.sim = 10*252, rseed = 210)
# Plot the simulated returns of the four series
simret <- rugarch::fitted(simgarch)
plot.zoo(simret)
plot.zoo(rugarch::sigma(simgarch))
# Compute the corresponding simulated prices (cumulative log-returns) and plot them
simprices <- exp(apply(simret, 2, "cumsum"))
matplot(simprices, type = "l", lwd = 3)
# Sensitivity of the GARCH likelihood optimization to starting values:
# fit once with defaults, then again with custom starts, and compare.
# Specify model with constant mean, standard GARCH and student t
garchspec <- rugarch::ugarchspec(mean.model = list(armaOrder = c(0, 0)),
variance.model = list(model = "sGARCH", variance.targeting = FALSE),
distribution.model = "std"
)
# Estimate using default starting values
garchfit <- rugarch::ugarchfit(spec=garchspec, data=EURUSDret)
# Print the estimated parameters and the likelihood
rugarch::coef(garchfit)
rugarch::likelihood(garchfit)
# Set other starting values and re-estimate
rugarch::setstart(garchspec) <- list(alpha1 = 0.05, beta1 = 0.9, shape = 6)
garchfit <- rugarch::ugarchfit(spec=garchspec, data=EURUSDret)
# Print the estimated parameters and the likelihood
# (a higher likelihood indicates a better local optimum was found)
rugarch::coef(garchfit)
rugarch::likelihood(garchfit)
# GARCH covariance / minimum-variance portfolio weights:
# fit the same GJR-GARCH spec to US and EU return series, compute the
# constant correlation of the standardized residuals, and combine it with
# the time-varying volatilities into a time-varying covariance.
garchspec <- rugarch::ugarchspec(mean.model = list(armaOrder = c(1,0)),
variance.model = list(model = "gjrGARCH"),
distribution.model = "sstd"
)
usgarchfit <- rugarch::ugarchfit(spec=garchspec, data=ret["2009/2017"])
eugarchfit <- rugarch::ugarchfit(spec=garchspec, data=msftret["2009/2017"])
# Compute the standardized US and EU returns, together with their correlation
stdusret <- rugarch::residuals(usgarchfit, standardize = TRUE)
stdeuret <- rugarch::residuals(eugarchfit, standardize = TRUE)
useucor <- as.numeric(cor(stdusret, stdeuret))
print(useucor)
# Compute the covariance and variance of the US and EU returns
# (constant-correlation model: cov_t = rho * sigma_us,t * sigma_eu,t)
useucov <- useucor * rugarch::sigma(usgarchfit) * rugarch::sigma(eugarchfit)
# Use ^ (not the undocumented ** parser synonym) for exponentiation,
# consistent with the rest of the file.
usvar <- rugarch::sigma(usgarchfit)^2
euvar <- rugarch::sigma(eugarchfit)^2
# Compute the minimum variance weight of the US ETF in the US-EU ETF portfolio
usweight <- (euvar - useucov) / (usvar + euvar - 2*useucov)
plot(usweight)
# Compute standardized returns
# stdmsftret <- residuals(msftgarchfit, standardize=TRUE)
# stdwmtret <- residuals(wmtgarchfit, standardize=TRUE)
# Print the correlation
# cor(stdmsftret, stdwmtret)
# Load the package PerformanceAnalytics
# library(PerformanceAnalytics)
# Plot the 3-month rolling correlation
# chart.RollingCorrelation(stdmsftret, stdwmtret, width = 66, main = "3-month rolling correlation between MSFT and WMT daily returns")
Chapter 1 - Introduction to RNA-Seq Theory and Workflow
Introduction to RNA-Seq:
RNA-Seq Workflow:
Differential Gene Expression Theory:
Example code includes:
# RNA-Seq differential expression setup: load the analysis/plotting
# libraries and inspect the raw count matrix.
# Load library for DESeq2
library(DESeq2)
# Load library for RColorBrewer
library(RColorBrewer)
# Load library for pheatmap
library(pheatmap)
# Load library for tidyverse
library(tidyverse)
# Explore the first six observations of smoc2_rawcounts
head(smoc2_rawcounts)
# Explore the structure of smoc2_rawcounts
str(smoc2_rawcounts)
# Build the sample metadata for the seven smoc2-overexpression samples:
# four fibrosis samples and three normal samples, with row names that
# match the column names of the count matrix (e.g. "smoc2_fibrosis1").
# Genotype vector: every sample over-expresses smoc2
genotype <- rep.int("smoc2_oe", 7)
# Condition vector: 4 fibrosis samples followed by 3 normal samples
condition <- rep(c("fibrosis", "normal"), times = c(4, 3))
# Assemble the metadata data frame (columns: genotype, condition)
smoc2_metadata <- data.frame(genotype = genotype, condition = condition)
# Row names combine the condition with the per-condition sample number
# (normal sample 2 is absent, hence the 1, 3, 4 numbering)
rownames(smoc2_metadata) <- paste0("smoc2_", condition, c(1, 2, 3, 4, 1, 3, 4))
Chapter 2 - Exploratory Data Analysis
Introduction to Differential Expression Analysis:
Organizing the data for DESeq2:
Count Normalization:
Hierarchical Heatmap:
Principal Component Analysis:
Example code includes:
# Exploratory QC for the DESeq2 analysis: align counts with metadata,
# normalize, variance-stabilize, and inspect sample correlation / PCA.
# Use the match() function to reorder the columns of the raw counts
match(rownames(smoc2_metadata), colnames(smoc2_rawcounts))
# Reorder the columns of the count data so they line up with the metadata rows
reordered_smoc2_rawcounts <- smoc2_rawcounts[, match(rownames(smoc2_metadata), colnames(smoc2_rawcounts))]
# Create a DESeq2 object (design: expression modeled on condition)
dds_smoc2 <- DESeqDataSetFromMatrix(countData = reordered_smoc2_rawcounts,
colData = smoc2_metadata,
design = ~ condition)
# Determine the size factors to use for normalization
dds_smoc2 <- estimateSizeFactors(dds_smoc2)
# Extract the normalized counts
smoc2_normalized_counts <- counts(dds_smoc2, normalized=TRUE)
# Transform the normalized counts (variance stabilizing transformation;
# blind=TRUE ignores the design so QC is unbiased)
vsd_smoc2 <- vst(dds_smoc2, blind=TRUE)
# Extract the matrix of transformed counts
vsd_mat_smoc2 <- assay(vsd_smoc2)
# Compute the correlation values between samples
vsd_cor_smoc2 <- cor(vsd_mat_smoc2)
# Plot the heatmap, annotated with the sample condition
pheatmap(vsd_cor_smoc2, annotation = select(smoc2_metadata, condition))
# Transform the normalized counts
vsd_smoc2 <- vst(dds_smoc2, blind = TRUE)
# Plot the PCA of PC1 and PC2
plotPCA(vsd_smoc2, intgroup="condition")
Chapter 3 - Differential Expression Analysis with DESeq2
DE Analysis:
DESeq2 Model - Dispersion:
DESeq2 Model - Contrasts:
DESeq2 Results:
Example code includes:
# Differential expression with DESeq2: fit the model, extract shrunken
# fold changes, and subset to significant genes.
# Create DESeq2 object
dds_smoc2 <- DESeqDataSetFromMatrix(countData = reordered_smoc2_rawcounts, colData = smoc2_metadata, design = ~ condition)
# Run the DESeq2 analysis (size factors, dispersions, model fit)
dds_smoc2 <- DESeq(dds_smoc2)
# Plot dispersions (QC: gene-wise estimates shrunk toward the fitted curve)
plotDispEsts(dds_smoc2)
# Extract the results of the differential expression analysis
# (fold changes are fibrosis relative to normal)
smoc2_res <- results(dds_smoc2,
contrast = c("condition", "fibrosis", "normal"),
alpha = 0.05)
# Shrink the log2 fold change estimates to be more accurate
# NOTE(review): newer DESeq2 releases prefer lfcShrink(dds, coef=..., type="apeglm");
# the contrast= form used here may be deprecated — confirm against the installed version.
smoc2_res <- lfcShrink(dds_smoc2,
contrast = c("condition", "fibrosis", "normal"),
res = smoc2_res)
# Explore the results() function
?results
# Extract results, additionally requiring |log2FC| > 0.32 (~1.25-fold)
smoc2_res <- results(dds_smoc2,
contrast = c("condition", "fibrosis", "normal"),
alpha = 0.05,
lfcThreshold = 0.32)
# Shrink the log2 fold changes
smoc2_res <- lfcShrink(dds_smoc2,
contrast = c("condition", "fibrosis", "normal"),
res = smoc2_res)
# Get an overview of the results
summary(smoc2_res)
# Save results as a data frame
smoc2_res_all <- data.frame(smoc2_res)
# Subset the results to only return the significant genes with p-adjusted values less than 0.05
smoc2_res_sig <- subset(smoc2_res_all, padj < 0.05)
Chapter 4 - Exploration of Differential Expression Results
Visualization of Results:
RNA-Seq DE Analysis Setup:
RNA-Seq DE Analysis Summary:
RNA-Seq Next Steps:
Example code includes:
# Visualization and reporting of the DE results: MA plot, volcano plot,
# heatmap of significant genes, then a second analysis across genotypes.
# Create MA plot
plotMA(smoc2_res)
# Generate logical column flagging significant genes
smoc2_res_all <- data.frame(smoc2_res) %>% mutate(threshold = padj < 0.05)
# Create the volcano plot
ggplot(smoc2_res_all) +
geom_point(aes(x = log2FoldChange, y = -log10(padj), color = threshold)) +
xlab("log2 fold change") +
ylab("-log10 adjusted p-value") +
theme(legend.position = "none",
plot.title = element_text(size = rel(1.5), hjust = 0.5),
axis.title = element_text(size = rel(1.25)))
# Subset normalized counts to significant genes
# NOTE(review): the object created earlier in this document is named
# smoc2_normalized_counts, not normalized_counts_smoc2 — confirm which
# name is defined in the session before running.
sig_norm_counts_smoc2 <- normalized_counts_smoc2[rownames(smoc2_res_sig), ]
# Choose heatmap color palette
heat_colors <- brewer.pal(n = 6, name = "YlOrRd")
# Plot heatmap (row-scaled so per-gene expression patterns are comparable)
pheatmap(sig_norm_counts_smoc2,
color = heat_colors,
cluster_rows = TRUE,
show_rownames = FALSE,
annotation = select(smoc2_metadata, condition),
scale = "row")
# Check that all count-data columns appear in the metadata row names
# (%in% checks membership only, not order; use all(colnames == rownames)
# to verify order as well)
all(colnames(all_rawcounts) %in% rownames(all_metadata))
# DESeq object to test for the effect of fibrosis regardless of genotype
dds_all <- DESeqDataSetFromMatrix(countData = all_rawcounts,
colData = all_metadata,
design = ~ genotype + condition)
# DESeq object to test for the effect of genotype on the effect of fibrosis
# (interaction term genotype:condition)
dds_complex <- DESeqDataSetFromMatrix(countData = all_rawcounts,
colData = all_metadata,
design = ~ genotype + condition + genotype:condition)
# Log transform counts for QC
vsd_all <- vst(dds_all, blind = TRUE)
# Create heatmap of sample correlation values
vsd_all %>%
assay() %>%
cor() %>%
pheatmap(annotation = select(all_metadata, c("condition", "genotype")))
# Create the PCA plot for PC1 and PC2 and color by condition
plotPCA(vsd_all, intgroup="condition")
# Create the PCA plot for PC1 and PC2 and color by genotype
plotPCA(vsd_all, intgroup="genotype")
# Select significant genes with padj < 0.05
smoc2_sig <- subset(res_all, padj < 0.05) %>%
data.frame() %>%
rownames_to_column(var = "geneID")
# Extract the top 6 genes with smallest padj values
smoc2_sig %>%
arrange(padj) %>%
select(geneID, padj) %>%
head()
Chapter 1 - What is Survival Analysis?
The term “survival analysis”:
Why learn survival methods?
Measures used in survival analysis:
Example code includes:
# Survival analysis basics on the GBSG2 breast-cancer dataset:
# inspect the data, count censored observations, and build a Surv object.
# Check out the help page for this dataset
# help(GBSG2, package = "TH.data")
# Load the data
data(GBSG2, package = "TH.data")
# Look at the summary of the dataset
summary(GBSG2)
## horTh age menostat tsize tgrade
## no :440 Min. :21.00 Pre :290 Min. : 3.00 I : 81
## yes:246 1st Qu.:46.00 Post:396 1st Qu.: 20.00 II :444
## Median :53.00 Median : 25.00 III:161
## Mean :53.05 Mean : 29.33
## 3rd Qu.:61.00 3rd Qu.: 35.00
## Max. :80.00 Max. :120.00
## pnodes progrec estrec time
## Min. : 1.00 Min. : 0.0 Min. : 0.00 Min. : 8.0
## 1st Qu.: 1.00 1st Qu.: 7.0 1st Qu.: 8.00 1st Qu.: 567.8
## Median : 3.00 Median : 32.5 Median : 36.00 Median :1084.0
## Mean : 5.01 Mean : 110.0 Mean : 96.25 Mean :1124.5
## 3rd Qu.: 7.00 3rd Qu.: 131.8 3rd Qu.: 114.00 3rd Qu.:1684.8
## Max. :51.00 Max. :2380.0 Max. :1144.00 Max. :2659.0
## cens
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.4359
## 3rd Qu.:1.0000
## Max. :1.0000
# Count censored (0) and uncensored (1) observations
num_cens <- table(GBSG2$cens)
num_cens
##
## 0 1
## 387 299
# Create barplot of censored and uncensored data
barplot(table(GBSG2$cens))
# Create Surv-Object (time-to-event plus event indicator)
sobj <- survival::Surv(GBSG2$time, GBSG2$cens)
# Look at 10 first elements ("+" marks right-censored times)
sobj[1:10]
## [1] 1814 2018 712 1807 772 448 2172+ 2161+ 471 2014+
# Look at summary
summary(sobj)
## time status
## Min. : 8.0 Min. :0.0000
## 1st Qu.: 567.8 1st Qu.:0.0000
## Median :1084.0 Median :0.0000
## Mean :1124.5 Mean :0.4359
## 3rd Qu.:1684.8 3rd Qu.:1.0000
## Max. :2659.0 Max. :1.0000
# Look at structure
str(sobj)
## 'Surv' num [1:686, 1:2] 1814 2018 712 1807 772 448 2172+ 2161+ 471 2014+ ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr [1:2] "time" "status"
## - attr(*, "type")= chr "right"
Chapter 2 - Estimation of Survival Curves
Kaplan-Meier Estimate:
Understanding and Visualizing Kaplan-Meier Curves:
Weibull Model for Estimating Survival Curves:
Visualizing Results of Weibull Model:
Example code includes:
# Tiny worked example of the Kaplan-Meier estimator on 5 observations
# (event = 1 observed, event = 0 censored).
# Create time and event data
time <- c(5, 6, 2, 4, 4)
event <- c(1, 0, 0, 1, 1)
# Compute Kaplan-Meier estimate (~ 1 means no covariates: one curve)
km <- survival::survfit(survival::Surv(time, event) ~ 1)
km
## Call: survfit(formula = survival::Surv(time, event) ~ 1)
##
## n events median 0.95LCL 0.95UCL
## 5.0 3.0 4.5 4.0 NA
# Take a look at the structure
str(km)
## List of 16
## $ n : int 5
## $ time : num [1:4] 2 4 5 6
## $ n.risk : num [1:4] 5 4 2 1
## $ n.event : num [1:4] 0 2 1 0
## $ n.censor : num [1:4] 1 0 0 1
## $ surv : num [1:4] 1 0.5 0.25 0.25
## $ std.err : num [1:4] 0 0.5 0.866 0.866
## $ cumhaz : num [1:4] 0 0.5 1 1
## $ std.chaz : num [1:4] 0 0.354 0.612 0.612
## $ type : chr "right"
## $ logse : logi TRUE
## $ conf.int : num 0.95
## $ conf.type: chr "log"
## $ lower : num [1:4] 1 0.1877 0.0458 0.0458
## $ upper : num [1:4] 1 1 1 1
## $ call : language survfit(formula = survival::Surv(time, event) ~ 1)
## - attr(*, "class")= chr "survfit"
# Create data.frame laying out the estimator's components per time point
data.frame(time = km$time, n.risk = km$n.risk, n.event = km$n.event,
n.censor = km$n.censor, surv = km$surv
)
## time n.risk n.event n.censor surv
## 1 2 5 0 1 1.00
## 2 4 4 2 0 0.50
## 3 5 2 1 0 0.25
## 4 6 1 0 1 0.25
# Why censoring matters: compare a naive estimate (treating censored
# observations as events) with the proper Kaplan-Meier estimate.
# Create dancedat data
dancedat <- data.frame(name = c("Chris", "Martin", "Conny", "Desi", "Reni", "Phil", "Flo", "Andrea", "Isaac", "Dayra", "Caspar"),
time = c(20, 2, 14, 22, 3, 7, 4, 15, 25, 17, 12),
obs_end = c(1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0)
)
# Estimate the survivor function pretending that all censored observations are actual observations.
km_wrong <- survival::survfit(survival::Surv(time) ~ 1, data = dancedat)
# Estimate the survivor function from this dataset via Kaplan-Meier.
km <- survival::survfit(survival::Surv(time, obs_end) ~ 1, data = dancedat)
# Plot the two and compare
survminer::ggsurvplot_combine(list(correct = km, wrong = km_wrong))
# Kaplan-Meier estimate for the GBSG2 data
km <- survival::survfit(survival::Surv(time, cens) ~ 1, data = GBSG2)
# plot of the Kaplan-Meier estimate
survminer::ggsurvplot(km)
# add the risk table to plot
survminer::ggsurvplot(km, risk.table = TRUE)
# add a line showing the median survival time
survminer::ggsurvplot(km, risk.table = TRUE, surv.median.line = "hv")
# Parametric alternative: fit an intercept-only Weibull survival model
# and reconstruct its smooth survival curve from predicted quantiles.
# Weibull model
wb <- survival::survreg(survival::Surv(time, cens) ~ 1, data = GBSG2)
# Compute the median survival from the model (p = 0.5 quantile)
predict(wb, type = "quantile", p = 0.5, newdata = data.frame(1))
## 1
## 1693.93
# 70 Percent of patients survive beyond time point...
predict(wb, type = "quantile", p = 1-0.7, newdata = data.frame(1))
## 1
## 1004.524
# Retrieve survival curve from model probabilities (0.99 down to 0.01)
surv <- seq(.99, .01, by = -.01)
# Get time for each probability (note p = 1 - survival probability)
t <- predict(wb, type = "quantile", p = 1 - surv, newdata = data.frame(1))
# Create data frame with the information
surv_wb <- data.frame(time = t, surv = surv)
# Look at first few lines of the result
head(surv_wb)
## time surv
## 1 60.6560 0.99
## 2 105.0392 0.98
## 3 145.0723 0.97
## 4 182.6430 0.96
## 5 218.5715 0.95
## 6 253.3125 0.94
# Create data frame with the information needed for ggsurvplot_df
# (upper/lower/std.err must exist even if NA)
surv_wb <- data.frame(time = t, surv = surv, upper = NA, lower = NA, std.err = NA)
# Plot
survminer::ggsurvplot_df(fit = surv_wb, surv.geom = geom_line)
Chapter 3 - Weibull Model
Why Use the Weibull Model?
Visualizing Weibull Models:
Other Distributions:
Example code includes:
# Inline construction of a 228-observation survival dataset
# (time, status 1=censored/2=event, sex).
# NOTE(review): this appears to replicate the survival::lung dataset — confirm.
dfTime <- c(306, 455, 1010, 210, 883, 1022, 310, 361, 218, 166, 170, 654, 728, 71, 567, 144, 613, 707, 61, 88, 301, 81, 624, 371, 394, 520, 574, 118, 390, 12, 473, 26, 533, 107, 53, 122, 814, 965, 93, 731, 460, 153, 433, 145, 583, 95, 303, 519, 643, 765, 735, 189, 53, 246, 689, 65, 5, 132, 687, 345, 444, 223, 175, 60, 163, 65, 208, 821, 428, 230, 840, 305, 11, 132, 226, 426, 705, 363, 11, 176, 791, 95, 196, 167, 806, 284, 641, 147, 740, 163, 655, 239, 88, 245, 588, 30, 179, 310, 477, 166, 559, 450, 364, 107, 177, 156, 529, 11, 429, 351, 15, 181, 283, 201, 524, 13, 212, 524, 288, 363, 442, 199, 550, 54, 558, 207, 92, 60, 551, 543, 293, 202, 353, 511, 267, 511, 371, 387, 457, 337, 201, 404, 222, 62, 458, 356, 353, 163, 31, 340, 229, 444, 315, 182, 156, 329, 364, 291, 179, 376, 384, 268, 292, 142, 413, 266, 194, 320, 181, 285, 301, 348, 197, 382, 303, 296, 180, 186, 145, 269, 300, 284, 350, 272, 292, 332, 285, 259, 110, 286, 270, 81, 131, 225, 269, 225, 243, 279, 276, 135, 79, 59, 240, 202, 235, 105, 224, 239, 237, 173, 252, 221, 185, 92, 13, 222, 192, 183, 211, 175, 197, 203, 116, 188, 191, 105, 174, 177)
dfStatus <- c(2, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 1, 2, 1, 2, 2, 2, 2, 2, 1, 2, 2, 1, 1, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 1, 2, 2, 1, 1, 2, 1, 2, 1, 1, 2, 2, 2, 2, 1, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 1, 1, 1, 2, 1, 2, 2, 2, 2, 2, 1, 2, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1)
# Sex coded 1 = male, 2 = female, converted to a labeled factor
dfSex <- factor(ifelse(c(1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 2, 2, 1, 2, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 2, 1, 2, 2, 2, 1, 1, 2, 1, 1, 2, 2, 1, 1, 1, 2, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 2, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 2, 1, 2, 2, 1, 1, 1, 2, 1, 1, 2, 1, 2, 1, 1, 1, 2, 1, 1, 2, 2, 1, 1, 2, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 1, 1, 1, 1, 2, 1, 2, 1, 2, 1, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 2, 1, 2, 2, 2, 1, 2, 2, 1, 2, 2, 1, 1, 2, 1, 1, 2, 1, 2, 2, 1, 2, 1, 1, 1, 2, 1, 2)==1, "male", "female"), levels=c("male", "female"))
dat <- data.frame(time=dfTime, status=dfStatus, sex=dfSex)
# Weibull models with covariates: fit, predict quantiles for imaginary
# patients, reshape to long format, and plot the survival curves.
# Look at the data set
str(dat)
## 'data.frame': 228 obs. of 3 variables:
## $ time : num 306 455 1010 210 883 ...
## $ status: num 2 2 1 2 2 1 2 2 2 2 ...
## $ sex : Factor w/ 2 levels "male","female": 1 1 1 1 1 1 2 2 1 1 ...
# Estimate a Weibull model with sex as covariate
wbmod <- survival::survreg(survival::Surv(time, status) ~ sex, data = dat)
coef(wbmod)
## (Intercept) sexfemale
## 5.884162 0.395578
# Weibull model on GBSG2 with hormone therapy as covariate
wbmod <- survival::survreg(survival::Surv(time, cens) ~ horTh, data = GBSG2)
# Retrieve survival curve from model (survival probabilities 0.99..0.01)
surv <- seq(.99, .01, by = -.01)
t_yes <- predict(wbmod, type = "quantile", p = 1-surv, newdata = data.frame(horTh = "yes"))
# Take a look at survival curve
str(t_yes)
## num [1:99] 76.4 131.4 180.9 227.2 271.4 ...
# Weibull model with two covariates
wbmod <- survival::survreg(survival::Surv(time, cens) ~ horTh + tsize, data = GBSG2)
# Imaginary patients: every horTh level crossed with tsize quartiles
newdat <- expand.grid(horTh = levels(GBSG2$horTh),
tsize = quantile(GBSG2$tsize, probs = c(0.25, 0.5, 0.75))
)
# Compute survival curves
surv <- seq(.99, .01, by = -.01)
t <- predict(wbmod, type = "quantile", p = 1-surv, newdata = newdat)
# How many rows and columns does t have? (6 patients x 99 quantiles)
dim(t)
## [1] 6 99
# Use cbind() to combine the information in newdat with t
surv_wbmod_wide <- cbind(newdat, t)
# Use melt() to bring the data.frame to long format
surv_wbmod <- reshape2::melt(surv_wbmod_wide, id.vars = c("horTh", "tsize"),
variable.name = "surv_id", value.name = "time"
)
# Use surv_wbmod$surv_id to add the correct survival probabilities surv
surv_wbmod$surv <- surv[as.numeric(surv_wbmod$surv_id)]
# Add columns upper, lower, std.err, and strata to the data.frame
# (required by ggsurvplot_df even when NA)
surv_wbmod[, c("upper", "lower", "std.err", "strata")] <- NA
# Take a look at the structure of the object
str(surv_wbmod)
## 'data.frame': 594 obs. of 9 variables:
## $ horTh : Factor w/ 2 levels "no","yes": 1 2 1 2 1 2 1 2 1 2 ...
## $ tsize : num 20 20 25 25 35 35 20 20 25 25 ...
## $ surv_id: Factor w/ 99 levels "1","2","3","4",..: 1 1 1 1 1 1 2 2 2 2 ...
## $ time : num 65.9 90 62 84.6 54.9 ...
## $ surv : num 0.99 0.99 0.99 0.99 0.99 0.99 0.98 0.98 0.98 0.98 ...
## $ upper : logi NA NA NA NA NA NA ...
## $ lower : logi NA NA NA NA NA NA ...
## $ std.err: logi NA NA NA NA NA NA ...
## $ strata : logi NA NA NA NA NA NA ...
# Plot the survival curves
survminer::ggsurvplot_df(surv_wbmod, surv.geom = geom_line,
linetype = "horTh", color = "tsize", legend.title = NULL
)
# Compare distributional assumptions: Weibull versus log-normal
# accelerated-failure-time models on the same data.
# Weibull model
wbmod <- survival::survreg(survival::Surv(time, cens) ~ horTh, data = GBSG2)
# Log-Normal model
lnmod <- survival::survreg(survival::Surv(time, cens) ~ horTh, data = GBSG2, dist = "lognormal")
# Newdata: one imaginary patient per hormone-therapy level
newdat <- data.frame(horTh = levels(GBSG2$horTh))
# Survival probabilities 0.99 down to 0.01
surv <- seq(0.99, .01, by = -.01)
# Survival curve from Weibull model and log-normal model
wbt <- predict(wbmod, type = "quantile", p = 1-surv, newdata = newdat)
lnt <- predict(lnmod, type = "quantile", p = 1-surv, newdata = newdat)
dfWbtLnt <- as.data.frame(rbind(wbt, lnt))
names(dfWbtLnt) <- as.character(1:99)
# Wide data: 4 rows = (no, yes) x (weibull, lognormal), 99 quantile columns
surv_wide <- cbind(data.frame(horTh=factor(c("no", "yes", "no", "yes"), levels=c("no", "yes"))),
dfWbtLnt,
data.frame(dist=factor(c("weibull", "weibull", "lognormal", "lognormal")))
)
# Melt the data.frame into long format.
surv_long <- reshape2::melt(surv_wide, id.vars = c("horTh", "dist"),
variable.name = "surv_id",
value.name = "time"
)
# Add column for the survival probabilities
surv_long$surv <- surv[as.numeric(surv_long$surv_id)]
# Add columns upper, lower, std.err, and strata containing NA values
surv_long[, c("upper", "lower", "std.err", "strata")] <- NA
# Plot the survival curves
survminer::ggsurvplot_df(surv_long, surv.geom = geom_line,
linetype = "horTh", color = "dist", legend.title = NULL
)
Chapter 4 - Cox Model
Cox Model - most widely used model in survival analysis:
Visualizing the Cox Model:
Recap:
Wrap Up:
Example code includes:
# Cox proportional-hazards models: add a performance score to dat,
# fit Cox models, and compute survival curves for imaginary patients.
# Performance score per patient (one NA present)
dat$performance <- c(90, 90, 90, 90, 100, 50, 70, 60, 70, 70, 80, 70, 90, 60, 80, 80, 90, 50, 60, 90, 80, 100, 70, 90, 90, 90, 100, 60, 80, 70, 90, 60, 60, 50, 70, 50, 70, 70, 50, 80, 80, 60, 90, 70, 60, 60, 90, 80, 90, 90, 90, 80, 90, 100, 90, 90, 100, 70, 80, 90, 70, 90, 80, 90, 80, 70, 70, 90, 100, 80, 90, 80, 70, 80, 90, 90, 100, 80, 90, 90, 100, 70, 80, 80, 80, 80, 80, 100, 90, 70, 100, 80, 90, 80, 100, 80, 80, 90, 90, 90, 100, 80, 70, 90, 50, 80, 80, 90, 100, 60, 90, 80, 80, 90, 80, 70, 70, 60, 70, 80, 90, 70, 70, 60, 90, 80, 80, 80, 80, 90, 80, 80, 100, 80, 90, 60, 80, 80, 90, 100, 70, 80, 70, 80, 80, 90, 100, 90, 100, 100, 70, 90, 90, 80, 70, 70, 90, 70, 80, 80, 90, 90, 60, 90, 80, 90, 80, 100, 90, 100, 90, 90, 90, 100, 90, 80, 60, 80, 80, 100, 100, 100, 90, 80, 90, 90, 70, 90, 80, 90, 80, 60, 90, 90, 90, 100, 80, 90, 100, 90, 90, 60, 90, 100, 100, NA, 80, 60, 80, 90, 100, 80, 90, 70, 80, 90, 90, 80, 70, 80, 80, 80, 80, 80, 90, 60, 90, 80)
str(dat)
## 'data.frame': 228 obs. of 4 variables:
## $ time : num 306 455 1010 210 883 ...
## $ status : num 2 2 1 2 2 1 2 2 2 2 ...
## $ sex : Factor w/ 2 levels "male","female": 1 1 1 1 1 1 2 2 1 1 ...
## $ performance: num 90 90 90 90 100 50 70 60 70 70 ...
# Compute Cox model
cxmod <- survival::coxph(survival::Surv(time, status) ~ performance, data = dat)
# Show model coefficient (negative: higher performance -> lower hazard)
coef(cxmod)
## performance
## -0.01644821
# Cox model on GBSG2 with two covariates
cxmod <- survival::coxph(survival::Surv(time, cens) ~ horTh + tsize, data = GBSG2)
# Imaginary patients: horTh levels crossed with tsize quartiles
newdat <- expand.grid(horTh = levels(GBSG2$horTh),
tsize = quantile(GBSG2$tsize, probs = c(0.25, 0.5, 0.75))
)
rownames(newdat) <- letters[1:6]
# Compute survival curves
cxsf <- survival::survfit(cxmod, data = GBSG2, newdata = newdat, conf.type = "none")
# Look at first 6 rows of cxsf$surv and time points
head(cxsf$surv)
## a b c d e f
## [1,] 1 1 1 1 1 1
## [2,] 1 1 1 1 1 1
## [3,] 1 1 1 1 1 1
## [4,] 1 1 1 1 1 1
## [5,] 1 1 1 1 1 1
## [6,] 1 1 1 1 1 1
head(cxsf$time)
## [1] 8 15 16 17 18 29
# Remove conf.type="none" per https://github.com/kassambara/survminer/issues/355
cxsf <- survival::survfit(cxmod, data = GBSG2, newdata = newdat)
# Compute data.frame needed for plotting
surv_cxmod0 <- survminer::surv_summary(cxsf)
# Get a character vector of patient letters (patient IDs)
pid <- as.character(surv_cxmod0$strata)
# Duplicate the rows of newdat so they align row-for-row with surv_cxmod0
m_newdat <- newdat[pid, ]
# Add patient info to data.frame
surv_cxmod <- cbind(surv_cxmod0, m_newdat)
# Plot
survminer::ggsurvplot_df(surv_cxmod, linetype = "horTh", color = "tsize",
legend.title = NULL, censor = FALSE
)
# Compute Cox model and survival curves for four performance levels
cxmod <- survival::coxph(survival::Surv(time, status) ~ performance, data = dat)
new_lung <- data.frame(performance = c(60, 70, 80, 90))
cxsf <- survival::survfit(cxmod, data = dat, newdata = new_lung)
# Use the summary of cxsf to take a vector of patient IDs
surv_cxmod0 <- survminer::surv_summary(cxsf)
pid <- as.character(surv_cxmod0$strata)
# Duplicate rows in newdat to fit with surv_cxmod0 and add them in
# (drop = FALSE keeps the single-column data.frame from collapsing to a vector)
m_newdat <- new_lung[pid, , drop = FALSE]
surv_cxmod <- cbind(surv_cxmod0, m_newdat)
# Plot
survminer::ggsurvplot_df(surv_cxmod, color = "performance", legend.title = NULL, censor = FALSE)
# Compute Kaplan-Meier curve
km <- survival::survfit(survival::Surv(time, status) ~ 1, data = dat)
# Compute Cox model
cxmod <- survival::coxph(survival::Surv(time, status) ~ performance, data = dat)
# Compute Cox model survival curves
new_lung <- data.frame(performance = c(60, 70, 80, 90))
cxsf <- survival::survfit(cxmod, data = dat, newdata = new_lung)
# Plot Kaplan-Meier curve
survminer::ggsurvplot(km, conf.int = FALSE)
# Plot Cox model survival curves
# survminer::ggsurvplot(cxsf, censor = FALSE)
Chapter 1 - Response Models for Product Sales
Fundamentals of Market Response Models:
Linear Response Models:
Non-Linear Response Models:
Example code includes:
# Market response modeling: load the sales and brand-choice datasets
# and inspect their structure and basic summary statistics.
load("./RInputFiles/sales.data.RData")
load("./RInputFiles/choice.data.RData")
str(sales.data)
## 'data.frame': 124 obs. of 6 variables:
## $ OBS : int 1 2 3 4 5 6 7 8 9 10 ...
## $ SALES : num 61.8 11.5 61.6 38.3 31.7 ...
## $ PRICE : num 1.09 1.27 1.27 1.27 1.27 ...
## $ DISPLAY : int 0 0 0 0 0 0 0 0 0 0 ...
## $ COUPON : int 0 0 0 0 0 0 0 0 0 1 ...
## $ DISPLAYCOUPON: int 0 0 0 0 0 0 0 0 0 0 ...
str(choice.data)
## 'data.frame': 2798 obs. of 13 variables:
## $ OBS : int 1 2 3 4 5 6 7 8 9 10 ...
## $ HOUSEHOLDID : int 1 1 1 1 1 1 1 1 1 1 ...
## $ LASTPURCHASE : int 0 0 0 0 0 0 0 0 0 0 ...
## $ BUD : num 1 1 1 1 1 1 1 1 1 1 ...
## $ HOPPINESS : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PRICE.BUD : num 0.052 0.052 0.046 0.052 0.046 ...
## $ PRICE.HOP : num 0.034 0.044 0.048 0.034 0.048 ...
## $ DISPL.BUD : int 0 0 0 0 0 0 0 0 0 0 ...
## $ DISPL.HOP : int 0 0 0 0 0 0 0 0 0 1 ...
## $ FEAT.BUD : int 0 0 1 0 1 0 0 0 0 0 ...
## $ FEAT.HOP : int 0 0 0 0 0 0 0 0 0 0 ...
## $ FEATDISPL.BUD: int 0 0 0 0 0 0 0 0 0 0 ...
## $ FEATDISPL.HOP: int 0 0 0 0 0 0 1 0 0 0 ...
# Tail of sales.data
tail(sales.data)
## OBS SALES PRICE DISPLAY COUPON DISPLAYCOUPON
## 119 119 11.82116 0.987500 0 0 0
## 120 120 14.36825 0.987500 0 0 0
## 121 121 34.71571 0.987500 0 0 0
## 122 122 52.61871 0.987500 0 0 0
## 123 123 52.41674 1.026786 0 0 0
## 124 124 49.35723 1.050000 0 0 0
# Mean SALES
mean(sales.data$SALES)
## [1] 118.8262
# Minimum SALES
min(sales.data$SALES)
## [1] 11.4692
# Maximum SALES
max(sales.data$SALES)
## [1] 1406.698
# Linear and semi-log (log-linear) response models of SALES on PRICE.
# Linear model explaining SALES by PRICE
linear.model <- lm(SALES ~ PRICE, data = sales.data)
# Obtain the model coefficients
coef(linear.model)
## (Intercept) PRICE
## 274.2486 -134.3097
# Obtain the intercept coefficient
coef(linear.model)[1]
## (Intercept)
## 274.2486
# Obtain the slope coefficient
coef(linear.model)[2]
## PRICE
## -134.3097
# Predict the SALES for the decreased PRICE of 1.05
# (manual prediction: intercept + slope * price)
coef(linear.model)[1] + 1.05 * coef(linear.model)[2]
## (Intercept)
## 133.2234
# Predict the SALES for the decreased PRICE of 0.95
coef(linear.model)[1] + 0.95 * coef(linear.model)[2]
## (Intercept)
## 146.6544
# Linear model explaining SALES by PRICE
linear.model <- lm(SALES ~ PRICE, data = sales.data)
# Plot SALES against PRICE
plot(SALES ~ PRICE, data = sales.data)
# Adding the model predictions (regression line)
abline(coef(linear.model))
# Linear model explaining log(SALES) by PRICE
log.model <- lm(log(SALES) ~ PRICE, data=sales.data)
# Obtaining the model coefficients
coef(log.model)
## (Intercept) PRICE
## 5.0843983 -0.6622516
# Plot log(SALES) against PRICE
plot(log(SALES) ~ PRICE, data=sales.data)
# Linear model explaining log(SALES) by PRICE
log.model <- lm(log(SALES) ~ PRICE, data=sales.data)
# Adding the model predictions
abline(coef(log.model))
Chapter 2 - Extended Sales Response Modeling
Model Extension Part 1: Dummy Variables:
Model Extensions Part 2: Dynamic Variables:
Number of Extensions Needed:
Example code includes:
# Dummy-variable extensions: model log(SALES) on promotional indicators
# (DISPLAY, COUPON, and their combination).
# Proportion of DISPLAY and no-DISPLAY activity
table(sales.data$DISPLAY) / sum(table(sales.data$DISPLAY))
##
## 0 1
## 0.733871 0.266129
# Mean of DISPLAY activity (mean of a 0/1 dummy = share of 1s)
mean(sales.data$DISPLAY)
## [1] 0.266129
# Mean of no-DISPLAY activity
1 - mean(sales.data$DISPLAY)
## [1] 0.733871
# Linear model explaining log(SALES) by DISPLAY
dummy.model <- lm(log(SALES) ~ DISPLAY, data = sales.data)
# Obtaining the coefficients
coef(dummy.model)
## (Intercept) DISPLAY
## 4.1949532 0.4625243
# Mean DISPLAY activity
mean(sales.data$DISPLAY)
## [1] 0.266129
# Mean COUPON activity
mean(sales.data$COUPON)
## [1] 0.09677419
# Mean DISPLAY and COUPON activity
mean(sales.data$DISPLAYCOUPON)
## [1] 0.05645161
# Summarize DISPLAY, COUPON, DISPLAYCOUPON activity
summary(sales.data[,c("DISPLAY", "COUPON", "DISPLAYCOUPON")])
## DISPLAY COUPON DISPLAYCOUPON
## Min. :0.0000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.0000 Median :0.00000 Median :0.00000
## Mean :0.2661 Mean :0.09677 Mean :0.05645
## 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.00000 Max. :1.00000
# Linear model explaining log(SALES) by DISPLAY, COUPON and DISPLAYCOUPON
dummy.model <- lm(log(SALES) ~ DISPLAY + COUPON + DISPLAYCOUPON, data = sales.data)
# Obtain the model coefficients
coef(dummy.model)
## (Intercept) DISPLAY COUPON DISPLAYCOUPON
## 3.7975707 0.8599068 1.7597567 2.1492468
# dummy.model updated to also include PRICE as a regressor
update(dummy.model, . ~ . + PRICE)
##
## Call:
## lm(formula = log(SALES) ~ DISPLAY + COUPON + DISPLAYCOUPON +
## PRICE, data = sales.data)
##
## Coefficients:
## (Intercept) DISPLAY COUPON DISPLAYCOUPON PRICE
## 3.4310 0.8747 1.7646 2.1630 0.3123
# Dynamic extensions with lagged regressors.
# NOTE(review): lag() here relies on dplyr::lag masking stats::lag (see the
# tidyverse attach conflicts at the top of this document); base stats::lag
# would NOT shift a plain vector.
# Compare lagged PRICE and original PRICE
head(cbind(sales.data$PRICE, lag(sales.data$PRICE)))
## [,1] [,2]
## [1,] 1.090000 NA
## [2,] 1.271818 1.090000
## [3,] 1.271818 1.271818
## [4,] 1.271818 1.271818
## [5,] 1.271818 1.271818
## [6,] 1.271818 1.271818
# Create the lagged PRICE variable (first element becomes NA)
Price.lag <- lag(sales.data$PRICE)
# Linear model explaining log(SALES) by PRICE and Price.lag
lag.model <- lm(log(SALES) ~ PRICE + Price.lag, data = sales.data)
# Obtain the coefficients
coef(lag.model)
## (Intercept) PRICE Price.lag
## 3.905902 -4.578985 4.934948
# Create the lagged COUPON variable
Coupon.lag <- lag(sales.data$COUPON)
# Update lag.model to include COUPON and Coupon.lag
update(lag.model, . ~ . + COUPON + Coupon.lag)
##
## Call:
## lm(formula = log(SALES) ~ PRICE + Price.lag + COUPON + Coupon.lag,
## data = sales.data)
##
## Coefficients:
## (Intercept) PRICE Price.lag COUPON Coupon.lag
## 3.833 -4.505 4.843 1.354 -0.384
sales.data2 <- sales.data %>%
mutate(Price.lag = lag(PRICE, 1),
Display.lag = lag(DISPLAY, 1),
Coupon.lag = lag(COUPON, 1),
DisplayCoupon.lag = lag(DISPLAYCOUPON, 1)
)
# Extended sales response model
extended.model <- lm(log(SALES) ~ PRICE + Price.lag + DISPLAY + Display.lag + COUPON +
Coupon.lag + DISPLAYCOUPON + DisplayCoupon.lag, data = sales.data2
)
# Plot log(SALES) against OBS
plot(log(SALES) ~ OBS, data = sales.data2)
# Add the model predictions
lines(c(NA, fitted.values(extended.model)) ~ OBS, data = sales.data2)
# Summarize the model
summary(extended.model)
##
## Call:
## lm(formula = log(SALES) ~ PRICE + Price.lag + DISPLAY + Display.lag +
## COUPON + Coupon.lag + DISPLAYCOUPON + DisplayCoupon.lag,
## data = sales.data2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.27541 -0.31110 0.01536 0.34871 0.92009
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.2561 0.5654 3.991 0.000117 ***
## PRICE -2.6857 0.7921 -3.390 0.000959 ***
## Price.lag 3.9920 0.7959 5.016 1.96e-06 ***
## DISPLAY 0.4570 0.1279 3.572 0.000521 ***
## Display.lag 0.5097 0.1180 4.319 3.36e-05 ***
## COUPON 1.7531 0.1576 11.121 < 2e-16 ***
## Coupon.lag -0.2098 0.1567 -1.339 0.183344
## DISPLAYCOUPON 2.0087 0.2017 9.960 < 2e-16 ***
## DisplayCoupon.lag 0.4489 0.2112 2.126 0.035695 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5 on 114 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.7135, Adjusted R-squared: 0.6934
## F-statistic: 35.5 on 8 and 114 DF, p-value: < 2.2e-16
# AIC of the extended response model
AIC(extended.model)
## [1] 189.21
# Single term deletion
AIC(lm(update(extended.model, . ~ . -Coupon.lag), data = sales.data2))
## [1] 189.1284
# Backward elimination
final.model <- MASS::stepAIC(extended.model, direction = "backward", trace = FALSE)
# Summarize the final model
summary(final.model)
##
## Call:
## lm(formula = log(SALES) ~ PRICE + Price.lag + DISPLAY + Display.lag +
## COUPON + DISPLAYCOUPON + DisplayCoupon.lag, data = sales.data2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.24037 -0.31964 0.01535 0.35218 0.94686
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.1887 0.5651 3.873 0.000179 ***
## PRICE -2.6888 0.7949 -3.383 0.000982 ***
## Price.lag 4.0267 0.7982 5.045 1.71e-06 ***
## DISPLAY 0.4524 0.1283 3.525 0.000609 ***
## Display.lag 0.5447 0.1155 4.717 6.78e-06 ***
## COUPON 1.7635 0.1580 11.161 < 2e-16 ***
## DISPLAYCOUPON 1.9954 0.2021 9.872 < 2e-16 ***
## DisplayCoupon.lag 0.4839 0.2103 2.301 0.023182 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5017 on 115 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.709, Adjusted R-squared: 0.6913
## F-statistic: 40.04 on 7 and 115 DF, p-value: < 2.2e-16
Chapter 3 - Response Models for Individual Demand
Models for Individual Demand:
Logistic Response Models:
Probit Response Models:
Example code includes:
# Structure of choice.data
str(choice.data)
## 'data.frame': 2798 obs. of 13 variables:
## $ OBS : int 1 2 3 4 5 6 7 8 9 10 ...
## $ HOUSEHOLDID : int 1 1 1 1 1 1 1 1 1 1 ...
## $ LASTPURCHASE : int 0 0 0 0 0 0 0 0 0 0 ...
## $ BUD : num 1 1 1 1 1 1 1 1 1 1 ...
## $ HOPPINESS : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PRICE.BUD : num 0.052 0.052 0.046 0.052 0.046 ...
## $ PRICE.HOP : num 0.034 0.044 0.048 0.034 0.048 ...
## $ DISPL.BUD : int 0 0 0 0 0 0 0 0 0 0 ...
## $ DISPL.HOP : int 0 0 0 0 0 0 0 0 0 1 ...
## $ FEAT.BUD : int 0 0 1 0 1 0 0 0 0 0 ...
## $ FEAT.HOP : int 0 0 0 0 0 0 0 0 0 0 ...
## $ FEATDISPL.BUD: int 0 0 0 0 0 0 0 0 0 0 ...
## $ FEATDISPL.HOP: int 0 0 0 0 0 0 1 0 0 0 ...
# Summarize purchases of HOPPINESS, BUD and PRICE.HOP and PRICE.BUD
summary(choice.data[,c("HOPPINESS", "BUD", "PRICE.HOP", "PRICE.BUD")])
## HOPPINESS BUD PRICE.HOP PRICE.BUD
## Min. :0.0000 Min. :0.0000 Min. :0.00300 Min. :0.00100
## 1st Qu.:0.0000 1st Qu.:1.0000 1st Qu.:0.03100 1st Qu.:0.03000
## Median :0.0000 Median :1.0000 Median :0.03400 Median :0.03400
## Mean :0.1001 Mean :0.8999 Mean :0.03355 Mean :0.03483
## 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:0.03600 3rd Qu.:0.04200
## Max. :1.0000 Max. :1.0000 Max. :0.08700 Max. :0.06100
# Plot HOPPINESS against PRICE.HOP
plot(HOPPINESS ~ PRICE.HOP, data = choice.data)
# Linear probability model explaining HOPPINESS by PRICE.HOP
probability.model <- lm(HOPPINESS ~ PRICE.HOP, data = choice.data)
# Add the model predictions
abline(coef(probability.model))
# Calculate the price ratio for HOPPINESS and BUD
choice.data$price.ratio <- log(choice.data$PRICE.HOP / choice.data$PRICE.BUD)
# Plot HOPPINESS purchases against the price ratio
plot(HOPPINESS ~ price.ratio, data = choice.data)
# Linear probability model explaining HOPPINESS by price.ratio
probability.model <- lm(HOPPINESS ~ price.ratio, data = choice.data)
# Add the model predictions
abline(probability.model)
# Logistic model explaining HOPPINESS by price.ratio
logistic.model <- glm(HOPPINESS ~ price.ratio, family = binomial, data = choice.data)
# Obtain the coefficients
coef(logistic.model)
## (Intercept) price.ratio
## -3.572678 -6.738768
# Plot HOPPINESS choices against price.ratio
plot(HOPPINESS ~ price.ratio, data = choice.data)
# Add the predictions of the logistic model
curve(predict(logistic.model, data.frame(price.ratio = x), type = "response"), add = TRUE)
# Linear probability model
coef(probability.model)
## (Intercept) price.ratio
## 0.09700236 -0.29594939
# Logistic model
margins::margins(logistic.model)
## Average marginal effects
## glm(formula = HOPPINESS ~ price.ratio, family = binomial, data = choice.data)
## price.ratio
## -0.4585
# Sequence of x values
x <- seq(-1, 1, by = 0.10)
# Conditional effect plot
margins::cplot(logistic.model, "price.ratio", xvals = x)
## xvals yvals upper lower
## 1 -1.0 0.9595380563 0.9814489158 9.376272e-01
## 2 -0.9 0.9235941387 0.9580857059 8.891026e-01
## 3 -0.8 0.8603664046 0.9101362487 8.105966e-01
## 4 -0.7 0.7584975129 0.8210699539 6.959251e-01
## 5 -0.6 0.6155217264 0.6799292789 5.511142e-01
## 6 -0.5 0.4493508612 0.5011249497 3.975768e-01
## 7 -0.4 0.2937644409 0.3271033760 2.604255e-01
## 8 -0.3 0.1749350431 0.1954437612 1.544263e-01
## 9 -0.2 0.0975345104 0.1119672207 8.310180e-02
## 10 -0.1 0.0522128673 0.0628159496 4.160979e-02
## 11 0.0 0.0273135795 0.0346981044 1.992905e-02
## 12 0.1 0.0141114813 0.0189496002 9.273362e-03
## 13 0.2 0.0072431374 0.0102678379 4.218437e-03
## 14 0.3 0.0037051921 0.0055333871 1.876997e-03
## 15 0.4 0.0018920795 0.0029702318 8.139273e-04
## 16 0.5 0.0009653426 0.0015895470 3.411382e-04
## 17 0.6 0.0004922958 0.0008485618 1.360297e-04
## 18 0.7 0.0002509978 0.0004520395 4.995612e-05
## 19 0.8 0.0001279565 0.0002403579 1.555513e-05
## 20 0.9 0.0000652272 0.0001275873 2.867069e-06
# Probit model explaining HOPPINESS by price.ratio
probit.model <- glm(HOPPINESS ~ price.ratio, family = binomial(link=probit), data = choice.data)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Obtain the coefficients
coef(probit.model)
## (Intercept) price.ratio
## -1.954092 -3.547546
# Compare the coefficients
cbind(coef(probit.model), coef(logistic.model))
## [,1] [,2]
## (Intercept) -1.954092 -3.572678
## price.ratio -3.547546 -6.738768
# Logistic model
margins::margins(logistic.model)
## Average marginal effects
## glm(formula = HOPPINESS ~ price.ratio, family = binomial, data = choice.data)
## price.ratio
## -0.4585
# Probit model
margins::margins(probit.model)
## Average marginal effects
## glm(formula = HOPPINESS ~ price.ratio, family = binomial(link = probit), data = choice.data)
## price.ratio
## -0.4503
Chapter 4 - Extended Demand Modeling
Model Selection:
Predictive Performance:
Model Validation:
Wrap Up:
Example code includes:
# Summarizing DISPL.HOP, FEAT.HOP, FEATDISPL.HOP actions
summary(choice.data[, c("DISPL.HOP", "FEAT.HOP", "FEATDISPL.HOP")])
## DISPL.HOP FEAT.HOP FEATDISPL.HOP
## Min. :0.00000 Min. :0.00000 Min. :0.000000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.000000
## Median :0.00000 Median :0.00000 Median :0.000000
## Mean :0.03538 Mean :0.03645 Mean :0.009292
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.000000
## Max. :1.00000 Max. :1.00000 Max. :1.000000
# Logistic model explaining HOPPINESS by price.ratio, DISPL.HOP, FEAT.HOP, FEATDISPL.HOP
extended.model <- glm(HOPPINESS ~ price.ratio + DISPL.HOP + FEAT.HOP + FEATDISPL.HOP,
family = binomial, data = choice.data
)
# Marginal effects for the extended logistic response model
margins::margins(extended.model)
## Average marginal effects
## glm(formula = HOPPINESS ~ price.ratio + DISPL.HOP + FEAT.HOP + FEATDISPL.HOP, family = binomial, data = choice.data)
## price.ratio DISPL.HOP FEAT.HOP FEATDISPL.HOP
## -0.4471 0.009486 0.04973 0.1086
# Summarize the model
summary(extended.model)
##
## Call:
## glm(formula = HOPPINESS ~ price.ratio + DISPL.HOP + FEAT.HOP +
## FEATDISPL.HOP, family = binomial, data = choice.data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.02676 -0.43078 -0.22935 -0.07645 3.10246
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.6250 0.1447 -25.053 < 2e-16 ***
## price.ratio -6.6663 0.4127 -16.154 < 2e-16 ***
## DISPL.HOP 0.1415 0.2599 0.544 0.586204
## FEAT.HOP 0.7415 0.3780 1.962 0.049806 *
## FEATDISPL.HOP 1.6189 0.4789 3.381 0.000723 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1820.0 on 2797 degrees of freedom
## Residual deviance: 1275.8 on 2793 degrees of freedom
## AIC: 1285.8
##
## Number of Fisher Scoring iterations: 7
# Null model explaining HOPPINESS by the intercept only
null.model <- glm(HOPPINESS ~ 1, family = binomial, data = choice.data)
# Compare null.mod against extended.mod
anova(extended.model, null.model, test = "Chisq")
## Analysis of Deviance Table
##
## Model 1: HOPPINESS ~ price.ratio + DISPL.HOP + FEAT.HOP + FEATDISPL.HOP
## Model 2: HOPPINESS ~ 1
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 2793 1275.8
## 2 2797 1820.0 -4 -544.23 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Backward elimination
final.model <- MASS::stepAIC(extended.model, direction = "backward", trace = FALSE)
# Summarize the final model
summary(final.model)
##
## Call:
## glm(formula = HOPPINESS ~ price.ratio + FEAT.HOP + FEATDISPL.HOP,
## family = binomial, data = choice.data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.04400 -0.43243 -0.22914 -0.07575 3.10947
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.6268 0.1450 -25.018 < 2e-16 ***
## price.ratio -6.7167 0.4033 -16.655 < 2e-16 ***
## FEAT.HOP 0.7327 0.3780 1.938 0.052577 .
## FEATDISPL.HOP 1.6041 0.4789 3.349 0.000811 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1820.0 on 2797 degrees of freedom
## Residual deviance: 1276.1 on 2794 degrees of freedom
## AIC: 1284.1
##
## Number of Fisher Scoring iterations: 7
# Classifying the predictions
predicted <- ifelse(fitted.values(extended.model) >= 0.5, 1, 0)
# Obtain the purchase predictions
table(predicted)
## predicted
## 0 1
## 2703 95
# Obtain the observed purchases
observed <- choice.data$HOPPINESS
# Cross-tabulating the observed vs. the predicted purchases
table(predicted, observed)/2798
## observed
## predicted 0 1
## 0 0.88849178 0.07755540
## 1 0.01143674 0.02251608
# Creating the Roc object
Roc <- pROC::roc(predictor = fitted.values(extended.model), response = observed)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Plot the ROC curve
pROC::plot.roc(Roc)
# Create the training dataset
train.data <- subset(choice.data, LASTPURCHASE == 0)
# Create the test dataset
test.data <- subset(choice.data, LASTPURCHASE == 1)
# Fit logistic response model to the training data set
train.model <- glm(HOPPINESS ~ price.ratio + FEAT.HOP + FEATDISPL.HOP,
family = binomial, data = train.data
)
# Predict the purchase probabilities for test.data
prob <- predict(train.model, newdata=test.data, type = "response")
# Classify the predictions
predicted <- ifelse(prob >= 0.5, 1, 0)
# Obtain the observed purchases from test.data
observed <- test.data$HOPPINESS
# Cross-tabulate the predicted vs. the observed purchases
table(predicted, observed)/300
## observed
## predicted 0 1
## 0 0.923333333 0.063333333
## 1 0.006666667 0.006666667
Chapter 1 - Review of data.table
Introduction:
Flexible Data Selection:
Executing Functions Inside data.tables:
Example code includes:
library(data.table)
diagnosticDT <- as.data.table(feather::read_feather("./RInputFiles/Feather_Data/diagnosticDT.feather"))
str(diagnosticDT)
## Classes 'data.table' and 'data.frame': 100 obs. of 4 variables:
## $ timestamp : POSIXct, format: "2018-01-01 00:00:00" "2018-01-01 00:00:36" ...
## $ engine_speed : num 4325 5255 4566 5317 2739 ...
## $ engine_temp : num 58.3 100.2 146.7 100.4 87.8 ...
## $ system_voltage: num 9177 9785 8248 9027 9327 ...
## - attr(*, ".internal.selfref")=<externalptr>
# Select system voltage directly
voltageDT <- diagnosticDT[, .(timestamp, system_voltage)]
# Select system voltage with .SD
voltageDT <- diagnosticDT[, .SD, .SDcols = c("timestamp", "system_voltage")]
# Select system voltage with .SD + a vector of names
voltage_cols <- c("timestamp", "system_voltage")
voltageDT <- diagnosticDT[, .SD, .SDcols = voltage_cols]
diagnosticDT[which.max(timestamp), .SD, .SDcols=c("timestamp", "system_voltage")]
## timestamp system_voltage
## 1: 2018-01-01 01:00:00 8031.055
# Store the names of all columns starting with "engine_" in a vector
engine_cols <- grep(pattern = "engine_", x = names(diagnosticDT), value = TRUE)
# Use that vector to create a new data.table with only engine signals
engineDT <- diagnosticDT[, .SD, .SDcols = engine_cols]
# Complete the function
# Add an interaction feature to a data.table by reference.
# someDT: data.table, modified in place.
# col1, col2: column names (strings) of the two factors to multiply.
# Creates a new column "<col1>_times_<col2>" holding the elementwise product.
add_interaction <- function(someDT, col1, col2){
  interaction_col <- sprintf("%s_times_%s", col1, col2)
  someDT[, (interaction_col) := get(col1) * get(col2)]
}
# Add an interaction
add_interaction(diagnosticDT, "engine_speed", "engine_temp")
# Check it out!
head(diagnosticDT)
## timestamp engine_speed engine_temp system_voltage
## 1: 2018-01-01 00:00:00 4325.4234 58.2805 9177.185
## 2: 2018-01-01 00:00:36 5254.6392 100.2430 9785.184
## 3: 2018-01-01 00:01:12 4565.8940 146.6556 8247.590
## 4: 2018-01-01 00:01:49 5316.7474 100.3676 9026.618
## 5: 2018-01-01 00:02:25 2738.8858 87.7780 9327.280
## 6: 2018-01-01 00:03:01 998.2307 137.3255 9531.084
## engine_speed_times_engine_temp
## 1: 252087.8
## 2: 526740.7
## 3: 669614.1
## 4: 533629.0
## 5: 240413.9
## 6: 137082.5
# Write a function to scale a column by 10
# Scale a column of a data.table by a constant factor, by reference.
#
# someDT: data.table, modified in place.
# col_to_scale: name (string) of the source column.
# new_col_name: name (string) of the new column to create.
# scale_factor: multiplier applied to the source column. Defaults to 10,
#   so existing callers keep the original behavior; the parameter
#   generalizes the previously hard-coded constant.
scale_by_10 <- function(someDT, col_to_scale, new_col_name, scale_factor = 10){
  someDT[, (new_col_name) := get(col_to_scale) * scale_factor]
}
# Try it out
scale_by_10(diagnosticDT, "engine_temp", "temp10")
# Check the state of the data.table
head(diagnosticDT)
## timestamp engine_speed engine_temp system_voltage
## 1: 2018-01-01 00:00:00 4325.4234 58.2805 9177.185
## 2: 2018-01-01 00:00:36 5254.6392 100.2430 9785.184
## 3: 2018-01-01 00:01:12 4565.8940 146.6556 8247.590
## 4: 2018-01-01 00:01:49 5316.7474 100.3676 9026.618
## 5: 2018-01-01 00:02:25 2738.8858 87.7780 9327.280
## 6: 2018-01-01 00:03:01 998.2307 137.3255 9531.084
## engine_speed_times_engine_temp temp10
## 1: 252087.8 582.805
## 2: 526740.7 1002.430
## 3: 669614.1 1466.556
## 4: 533629.0 1003.676
## 5: 240413.9 877.780
## 6: 137082.5 1373.255
# Write a function that squares every numeric column
# Add squared versions of the given columns to a data.table by reference.
#
# someDT: data.table, modified in place.
# cols: character vector of column names; each column "x" yields a new
#   column "x_squared" containing x^2.
# Uses a single grouped `:=` assignment over .SD rather than one `[` call
# per column, which is the idiomatic (and cheaper) data.table form.
add_square_features <- function(someDT, cols){
  squared_names <- paste0(cols, "_squared")
  someDT[, (squared_names) := lapply(.SD, function(x) x^2), .SDcols = cols]
}
# Look at the difference!
add_square_features(diagnosticDT, c("engine_speed", "engine_temp", "system_voltage"))
head(diagnosticDT)
## timestamp engine_speed engine_temp system_voltage
## 1: 2018-01-01 00:00:00 4325.4234 58.2805 9177.185
## 2: 2018-01-01 00:00:36 5254.6392 100.2430 9785.184
## 3: 2018-01-01 00:01:12 4565.8940 146.6556 8247.590
## 4: 2018-01-01 00:01:49 5316.7474 100.3676 9026.618
## 5: 2018-01-01 00:02:25 2738.8858 87.7780 9327.280
## 6: 2018-01-01 00:03:01 998.2307 137.3255 9531.084
## engine_speed_times_engine_temp temp10 engine_speed_squared
## 1: 252087.8 582.805 18709287.4
## 2: 526740.7 1002.430 27611232.7
## 3: 669614.1 1466.556 20847387.7
## 4: 533629.0 1003.676 28267802.9
## 5: 240413.9 877.780 7501495.2
## 6: 137082.5 1373.255 996464.6
## engine_temp_squared system_voltage_squared
## 1: 3396.617 84220718
## 2: 10048.654 95749817
## 3: 21507.876 68022738
## 4: 10073.648 81479833
## 5: 7704.976 86998160
## 6: 18858.291 90841562
# Change names
setnames(diagnosticDT, old = c("timestamp"), new = "obs_time")
# Tag all the numeric columns with "_NUMERIC"
# Rename the given columns in place, appending a "_NUMERIC" suffix to each.
# DT: data.table, modified by reference via setnames().
# cols: character vector of existing column names to tag.
tag_numeric_cols <- function(DT, cols){
  suffixed <- paste0(cols, "_NUMERIC")
  setnames(DT, old = cols, new = suffixed)
}
# Tag numeric columns
tag_numeric_cols(diagnosticDT, c("engine_speed", "engine_temp", "system_voltage"))
diagnosticDT <- as.data.table(feather::read_feather("./RInputFiles/Feather_Data/diagnosticDT.feather"))
str(diagnosticDT)
## Classes 'data.table' and 'data.frame': 100 obs. of 4 variables:
## $ timestamp : POSIXct, format: "2018-01-01 00:00:00" "2018-01-01 00:00:36" ...
## $ engine_speed : num 4325 5255 4566 5317 2739 ...
## $ engine_temp : num 58.3 100.2 146.7 100.4 87.8 ...
## $ system_voltage: num 9177 9785 8248 9027 9327 ...
## - attr(*, ".internal.selfref")=<externalptr>
diagnosticDT2 <- as.data.table(feather::read_feather("./RInputFiles/Feather_Data/diagnosticDT.feather"))
str(diagnosticDT2)
## Classes 'data.table' and 'data.frame': 100 obs. of 4 variables:
## $ timestamp : POSIXct, format: "2018-01-01 00:00:00" "2018-01-01 00:00:36" ...
## $ engine_speed : num 4325 5255 4566 5317 2739 ...
## $ engine_temp : num 58.3 100.2 146.7 100.4 87.8 ...
## $ system_voltage: num 9177 9785 8248 9027 9327 ...
## - attr(*, ".internal.selfref")=<externalptr>
# Mean of engine temp
diagnosticDT[, mean(engine_temp)]
## [1] 94.49417
# Correlation between engine_temp and system_voltage
diagnosticDT[, cor(engine_temp, system_voltage)]
## [1] 0.06351799
# Get classes of column names
# Return the correlation matrix over the numeric columns of a data.table.
#
# DT: a data.table; non-numeric columns (e.g. POSIXct timestamps) are
#   excluded before calling cor().
correlations <- function(DT){
  # BUG FIX: the original inspected the global `diagnosticDT` instead of
  # the `DT` argument when locating numeric columns, so any other input
  # was silently filtered by the wrong table's column types.
  # vapply() replaces sapply() so the mask is always a logical vector.
  num_cols <- DT[, vapply(.SD, is.numeric, logical(1))]
  numeric_cols <- names(DT)[num_cols]
  return(DT[, cor(.SD), .SDcols = numeric_cols])
}
# Mean of system voltage
diagnosticDT[, lapply(.SD, mean), .SDcols = c("system_voltage")]
## system_voltage
## 1: 9030.28
# Mean of all engine cols
engine_cols <- c("engine_speed", "engine_temp")
meanDT <- diagnosticDT[, lapply(.SD, mean), .SDcols = engine_cols]
print(meanDT)
## engine_speed engine_temp
## 1: 3056.395 94.49417
# Return the names of the numeric columns of a data.table.
#
# DT: a data.table. Result is a character vector (possibly empty).
# vapply() replaces sapply() so the column mask is guaranteed to be a
# logical vector even for degenerate (zero-column) input.
get_numeric_cols <- function(DT){
  num_cols <- DT[, vapply(.SD, is.numeric, logical(1))]
  return(names(DT)[num_cols])
}
# Function to get correlation matrix from a data.table
# Correlation matrix of a data.table, computed over its numeric columns only.
# Delegates column selection to get_numeric_cols().
corrmat_from_dt <- function(DT){
  return(DT[, cor(.SD), .SDcols = get_numeric_cols(DT)])
}
# Get correlation matrices
corrmat_from_dt(diagnosticDT)
## engine_speed engine_temp system_voltage
## engine_speed 1.0000000000 0.0001975067 0.16182688
## engine_temp 0.0001975067 1.0000000000 0.06351799
## system_voltage 0.1618268766 0.0635179886 1.00000000
corrmat_from_dt(diagnosticDT2)
## engine_speed engine_temp system_voltage
## engine_speed 1.0000000000 0.0001975067 0.16182688
## engine_temp 0.0001975067 1.0000000000 0.06351799
## system_voltage 0.1618268766 0.0635179886 1.00000000
Chapter 2 - Getting Time Series Data into data.table
Overview of the POSIXct Type:
Creating data.tables from vectors:
Coercing from xts:
Combining datasets with merge() and rbindlist():
Example code includes:
excelDT <- data.table(timecol=42885:42889, sales=c(105, 92, 500, 81, 230))
stringDT <- data.table(timecol=c("2017-06-01", "2017-06-02", "2017-06-03", "2017-06-04", "2017-06-05"),
sales=c(105, 92, 500, 81, 230)
)
epochSecondsDT <- data.table(timecol=1496275200 + 24*60*60*0:4, sales=c(105, 92, 500, 81, 230))
epochMillisDT <- data.table(timecol=1000 * c(1496275200 + 24*60*60*0:4), sales=c(105, 92, 500, 81, 230))
# Create POSIXct dates from a hypothetical Excel dataset
# NOTE(review): real Excel serial dates require origin = "1899-12-30" in R
# (Excel's epoch is offset by two days because of its phantom 1900-02-29).
# This course dataset's serials only line up with the 2017-06-01..05 dates
# used elsewhere under a "1900-01-01" origin — confirm before reusing this
# conversion on genuine Excel exports.
excelDT[, posix := as.POSIXct(as.Date(timecol, origin = "1900-01-01"), tz = "UTC")]
# Convert strings to POSIXct
stringDT[, posix := as.POSIXct(timecol, tz = "UTC")]
# Convert epoch seconds to POSIXct
epochSecondsDT[, posix := as.POSIXct(timecol, tz = "UTC", origin = "1970-01-01")]
# Convert epoch milliseconds to POSIXct
epochMillisDT[, posix := as.POSIXct(timecol/1000, origin = "1970-01-01", tz="UTC")]
stringDT <- data.table(timecol1=c("2017-06-01 10", "2017-06-02 5", "2017-06-03 10",
"2017-06-04 7", "2017-06-05 9"
),
timecol2=c("06-01-2017 10:00:00", "06-02-2017 05:00:00", "06-03-2017 10:00:00",
"06-05-2017 07:00:00", "06-04-2017 09:00:00"
),
sales=c(105, 92, 500, 81, 230)
)
stringDT
## timecol1 timecol2 sales
## 1: 2017-06-01 10 06-01-2017 10:00:00 105
## 2: 2017-06-02 5 06-02-2017 05:00:00 92
## 3: 2017-06-03 10 06-03-2017 10:00:00 500
## 4: 2017-06-04 7 06-05-2017 07:00:00 81
## 5: 2017-06-05 9 06-04-2017 09:00:00 230
# Convert timecol1
str(stringDT)
## Classes 'data.table' and 'data.frame': 5 obs. of 3 variables:
## $ timecol1: chr "2017-06-01 10" "2017-06-02 5" "2017-06-03 10" "2017-06-04 7" ...
## $ timecol2: chr "06-01-2017 10:00:00" "06-02-2017 05:00:00" "06-03-2017 10:00:00" "06-05-2017 07:00:00" ...
## $ sales : num 105 92 500 81 230
## - attr(*, ".internal.selfref")=<externalptr>
stringDT[, posix1 := lubridate::ymd_h(timecol1)]
str(stringDT)
## Classes 'data.table' and 'data.frame': 5 obs. of 4 variables:
## $ timecol1: chr "2017-06-01 10" "2017-06-02 5" "2017-06-03 10" "2017-06-04 7" ...
## $ timecol2: chr "06-01-2017 10:00:00" "06-02-2017 05:00:00" "06-03-2017 10:00:00" "06-05-2017 07:00:00" ...
## $ sales : num 105 92 500 81 230
## $ posix1 : POSIXct, format: "2017-06-01 10:00:00" "2017-06-02 05:00:00" ...
## - attr(*, ".internal.selfref")=<externalptr>
# Convert timecol2
str(stringDT)
## Classes 'data.table' and 'data.frame': 5 obs. of 4 variables:
## $ timecol1: chr "2017-06-01 10" "2017-06-02 5" "2017-06-03 10" "2017-06-04 7" ...
## $ timecol2: chr "06-01-2017 10:00:00" "06-02-2017 05:00:00" "06-03-2017 10:00:00" "06-05-2017 07:00:00" ...
## $ sales : num 105 92 500 81 230
## $ posix1 : POSIXct, format: "2017-06-01 10:00:00" "2017-06-02 05:00:00" ...
## - attr(*, ".internal.selfref")=<externalptr>
stringDT[, posix2 := lubridate::mdy_hms(timecol2)]
str(stringDT)
## Classes 'data.table' and 'data.frame': 5 obs. of 5 variables:
## $ timecol1: chr "2017-06-01 10" "2017-06-02 5" "2017-06-03 10" "2017-06-04 7" ...
## $ timecol2: chr "06-01-2017 10:00:00" "06-02-2017 05:00:00" "06-03-2017 10:00:00" "06-05-2017 07:00:00" ...
## $ sales : num 105 92 500 81 230
## $ posix1 : POSIXct, format: "2017-06-01 10:00:00" "2017-06-02 05:00:00" ...
## $ posix2 : POSIXct, format: "2017-06-01 10:00:00" "2017-06-02 05:00:00" ...
## - attr(*, ".internal.selfref")=<externalptr>
# Generate a series of dates
march_dates <- seq.POSIXt(as.POSIXct("2017-03-01", tz="UTC"),
as.POSIXct("2017-03-31", tz="UTC"),
length.out = 31
)
# Generate hourly data
hourly_times <- seq.POSIXt(as.POSIXct("2017-05-01 00:00:00", tz="UTC"),
as.POSIXct("2017-05-02 00:00:00", tz="UTC"),
length.out = 1 + 24
)
# Generate sample IoT data
iotDT <- data.table(timestamp = seq.POSIXt(as.POSIXct("2016-04-19 00:00:00", tz="UTC"),
as.POSIXct("2016-04-20 00:00:00", tz="UTC"),
length.out = 1 + 24
),
engine_temp = rnorm(n=1+24),
ambient_temp = rnorm(n=1+24)
)
head(iotDT)
## timestamp engine_temp ambient_temp
## 1: 2016-04-19 00:00:00 0.070400856 1.3290296
## 2: 2016-04-19 01:00:00 -0.692487882 -0.1275896
## 3: 2016-04-19 02:00:00 0.523005842 1.1565826
## 4: 2016-04-19 03:00:00 -1.316436702 0.4565383
## 5: 2016-04-19 04:00:00 -1.516521174 0.3326634
## 6: 2016-04-19 05:00:00 -0.006312561 -1.5452930
# Create a 500-row data.table
start_date <- "2016-01-01"
end_date <- "2018-01-01"
someDT <- data.table(timestamp = seq.POSIXt(as.POSIXct(start_date),
as.POSIXct(end_date),
length.out = 500
)
)
# Function to add random columns
# Append one standard-normal column per name in `colnames`, by reference.
# DT: data.table, modified in place; each new column gets .N draws (one per
# row). Output is random unless the caller seeds the RNG beforehand.
add_random_cols <- function(DT, colnames){
  for (nm in colnames){
    DT[, (nm) := rnorm(n = .N)]
  }
}
# Check out the new data.table
add_random_cols(someDT, c("copper", "chopper", "stopper"))
# Simulated data
some_data <- rnorm(100)
some_dates <- seq.POSIXt(from = as.POSIXct("2017-06-15 00:00:00Z", tz = "UTC"),
to = as.POSIXct("2017-06-15 01:00:00Z", tz = "UTC"),
length.out = 100
)
# Make your own 'xts' object
myXTS <- xts::xts(some_data, order.by=some_dates)
# View the timezone
print(attr(myXTS, "tzone"))
## NULL
nickelXTS <- readRDS("./RInputFiles/nickelXTS.rds")
# All observations after 2018-01-01 00:45:00
fifteenXTS <- nickelXTS["2018-01-01 00:45:00/"]
# Check the structure
str(fifteenXTS)
## An 'xts' object on 2018-01-01 00:45:27/2018-01-01 01:00:00 containing:
## Data: num [1:25, 1] 14764 13465 13095 14700 13332 ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr "nickel_price"
## Indexed by objects of class: [POSIXct,POSIXt] TZ: UTC
## xts Attributes:
## NULL
# 10-minute aggregations
tenMinuteXTS <- xts::to.minutes10(nickelXTS)
print(tenMinuteXTS)
## Warning: timezone of object (UTC) is different than current timezone ().
## nickelXTS.Open nickelXTS.High nickelXTS.Low nickelXTS.Close
## 2018-01-01 00:09:41 14986.29 14986.29 13263.81 14642.82
## 2018-01-01 00:19:23 14682.81 14700.31 13253.49 14070.30
## 2018-01-01 00:29:41 13561.37 14975.33 13131.90 13879.25
## 2018-01-01 00:39:23 14295.35 14956.31 13388.33 14935.83
## 2018-01-01 00:49:41 13426.89 14982.86 13095.23 14023.63
## 2018-01-01 00:59:23 13659.71 14828.94 13104.92 14828.94
## 2018-01-01 01:00:00 14780.57 14780.57 14780.57 14780.57
# 1-minute aggregations
oneMinuteXTS <- xts::to.minutes(nickelXTS)
# Convert to a data.table
nickelDT <- as.data.table(nickelXTS)
str(nickelDT)
## Classes 'data.table' and 'data.frame': 100 obs. of 2 variables:
## $ index : POSIXct, format: "2018-01-01 00:00:00" "2018-01-01 00:00:36" ...
## $ nickel_price: num 14986 14132 14353 14798 14303 ...
## - attr(*, ".internal.selfref")=<externalptr>
# Change names
setnames(nickelDT, old="index", new="spot_price_timestamp")
print(nickelDT)
## spot_price_timestamp nickel_price
## 1: 2018-01-01 00:00:00 14986.29
## 2: 2018-01-01 00:00:36 14131.57
## 3: 2018-01-01 00:01:12 14352.72
## 4: 2018-01-01 00:01:49 14797.59
## 5: 2018-01-01 00:02:25 14303.15
## 6: 2018-01-01 00:03:01 13692.15
## 7: 2018-01-01 00:03:38 14265.23
## 8: 2018-01-01 00:04:14 13322.13
## 9: 2018-01-01 00:04:50 14149.04
## 10: 2018-01-01 00:05:27 13263.81
## 11: 2018-01-01 00:06:03 14038.89
## 12: 2018-01-01 00:06:40 13380.40
## 13: 2018-01-01 00:07:16 13968.63
## 14: 2018-01-01 00:07:52 13671.97
## 15: 2018-01-01 00:08:29 13620.77
## 16: 2018-01-01 00:09:05 14815.78
## 17: 2018-01-01 00:09:41 14642.82
## 18: 2018-01-01 00:10:18 14682.81
## 19: 2018-01-01 00:10:54 13846.74
## 20: 2018-01-01 00:11:30 13954.12
## 21: 2018-01-01 00:12:07 13953.37
## 22: 2018-01-01 00:12:43 13785.52
## 23: 2018-01-01 00:13:20 14042.39
## 24: 2018-01-01 00:13:56 14700.31
## 25: 2018-01-01 00:14:32 13978.34
## 26: 2018-01-01 00:15:09 13940.36
## 27: 2018-01-01 00:15:45 14144.94
## 28: 2018-01-01 00:16:21 13390.14
## 29: 2018-01-01 00:16:58 13253.49
## 30: 2018-01-01 00:17:34 14510.67
## 31: 2018-01-01 00:18:10 13581.22
## 32: 2018-01-01 00:18:47 14518.28
## 33: 2018-01-01 00:19:23 14070.30
## 34: 2018-01-01 00:20:00 13561.37
## 35: 2018-01-01 00:20:36 13831.70
## 36: 2018-01-01 00:21:12 13131.90
## 37: 2018-01-01 00:21:49 13224.83
## 38: 2018-01-01 00:22:25 14394.62
## 39: 2018-01-01 00:23:01 13763.71
## 40: 2018-01-01 00:23:38 14624.69
## 41: 2018-01-01 00:24:14 14785.23
## 42: 2018-01-01 00:24:50 13140.53
## 43: 2018-01-01 00:25:27 14030.86
## 44: 2018-01-01 00:26:03 13503.92
## 45: 2018-01-01 00:26:40 14388.48
## 46: 2018-01-01 00:27:16 14975.33
## 47: 2018-01-01 00:27:52 14130.33
## 48: 2018-01-01 00:28:29 13278.99
## 49: 2018-01-01 00:29:05 14868.12
## 50: 2018-01-01 00:29:41 13879.25
## 51: 2018-01-01 00:30:18 14295.35
## 52: 2018-01-01 00:30:54 14944.84
## 53: 2018-01-01 00:31:30 14739.03
## 54: 2018-01-01 00:32:07 14859.15
## 55: 2018-01-01 00:32:43 14365.30
## 56: 2018-01-01 00:33:20 14123.30
## 57: 2018-01-01 00:33:56 13838.13
## 58: 2018-01-01 00:34:32 14772.30
## 59: 2018-01-01 00:35:09 14956.31
## 60: 2018-01-01 00:35:45 14342.06
## 61: 2018-01-01 00:36:21 14886.71
## 62: 2018-01-01 00:36:58 14384.66
## 63: 2018-01-01 00:37:34 13605.72
## 64: 2018-01-01 00:38:10 13388.33
## 65: 2018-01-01 00:38:47 13991.27
## 66: 2018-01-01 00:39:23 14935.83
## 67: 2018-01-01 00:40:00 13426.89
## 68: 2018-01-01 00:40:36 14425.18
## 69: 2018-01-01 00:41:12 14982.86
## 70: 2018-01-01 00:41:49 14979.89
## 71: 2018-01-01 00:42:25 13266.64
## 72: 2018-01-01 00:43:01 13128.31
## 73: 2018-01-01 00:43:38 13227.89
## 74: 2018-01-01 00:44:14 14874.00
## 75: 2018-01-01 00:44:50 14266.51
## 76: 2018-01-01 00:45:27 14763.78
## 77: 2018-01-01 00:46:03 13464.51
## 78: 2018-01-01 00:46:40 13095.23
## 79: 2018-01-01 00:47:16 14700.34
## 80: 2018-01-01 00:47:52 13331.94
## 81: 2018-01-01 00:48:29 13134.86
## 82: 2018-01-01 00:49:05 13367.25
## 83: 2018-01-01 00:49:41 14023.63
## 84: 2018-01-01 00:50:18 13659.71
## 85: 2018-01-01 00:50:54 13414.85
## 86: 2018-01-01 00:51:30 13193.81
## 87: 2018-01-01 00:52:07 13104.92
## 88: 2018-01-01 00:52:43 13188.39
## 89: 2018-01-01 00:53:20 14038.85
## 90: 2018-01-01 00:53:56 14149.33
## 91: 2018-01-01 00:54:32 13628.37
## 92: 2018-01-01 00:55:09 13676.95
## 93: 2018-01-01 00:55:45 14365.72
## 94: 2018-01-01 00:56:21 13696.46
## 95: 2018-01-01 00:56:58 14249.07
## 96: 2018-01-01 00:57:34 14666.98
## 97: 2018-01-01 00:58:10 14298.75
## 98: 2018-01-01 00:58:47 14419.84
## 99: 2018-01-01 00:59:23 14828.94
## 100: 2018-01-01 01:00:00 14780.57
## spot_price_timestamp nickel_price
treasuryDT <- data.table(timestamp=as.POSIXct("2018-03-01 00:00:00", tz="UTC") + 0:4 + 0.001,
treasury_10y=c(0.71, 0.8, 0.78, 0.77, 0.73)
)
oilDT <- data.table(timestamp=as.POSIXct("2018-03-01 00:00:00", tz="UTC") + 0:4,
oil=c(44.07, 44.15, 44.14, 44.06, 44.09)
)
# Naive approach (merge on timestamp)
# BUG FIX: merge() takes `by =`, not `on =`. The misspelled argument was
# swallowed by `...` and merge silently fell back to the shared column
# names — the same column here, but the join key should be explicit.
newDT <- merge(treasuryDT, oilDT, by = "timestamp")
str(newDT)
## Classes 'data.table' and 'data.frame': 0 obs. of 3 variables:
## $ timestamp : 'POSIXct' num(0)
## - attr(*, "tzone")= chr "UTC"
## $ treasury_10y: num
## $ oil : num
## - attr(*, ".internal.selfref")=<externalptr>
# Check out the precision
treasuryDT[, as.numeric(timestamp)]
## [1] 1519862400 1519862401 1519862402 1519862403 1519862404
oilDT[, as.numeric(timestamp)]
## [1] 1519862400 1519862401 1519862402 1519862403 1519862404
# Clean up and merge
# Round away the sub-second offsets so the two tables' keys match exactly.
treasuryDT[, timestamp := as.POSIXct(round(as.numeric(timestamp)), origin = "1970-01-01")]
# BUG FIX: use `by =` — merge() has no `on =` argument; the misspelling
# was silently ignored and merge fell back to the common column names.
newDT <- merge(treasuryDT, oilDT, by = "timestamp")
str(newDT)
## Classes 'data.table' and 'data.frame': 5 obs. of 3 variables:
## $ timestamp : POSIXct, format: "2018-02-28 18:00:00" "2018-02-28 18:00:01" ...
## $ treasury_10y: num 0.71 0.8 0.78 0.77 0.73
## $ oil : num 44.1 44.1 44.1 44.1 44.1
## - attr(*, ".internal.selfref")=<externalptr>
## - attr(*, "sorted")= chr "timestamp"
# Add grouping indicator
# fxDT[, yearmonth := paste0(year(timestamp), "_", month(timestamp))]
# exportDT[, yearmonth := paste0(year(timestamp), "_", month(timestamp))]
# Monthly exchange rate
# monthlyFXDT <- fxDT[, .(exch_rate = mean(exchange_rate)), by = yearmonth]
# Merge
# merge(exportDT, monthlyFXDT, by="yearmonth")
Chapter 3 - Generating Lags, Differences, and Windowed Aggregations
Generating Lags:
Generating Growth Rates and Differences:
Windowing with j and by:
Example code includes:
dailyDT <- data.table(timestamp=as.POSIXct("2018-08-01", tz="UTC") + lubridate::days(0:152),
sales=c(483.08, 449.25, 523.6, 498.36, 448, 487.03, 502.91, 475.69, 535.39, 471.54, 494.57, 509.6, 538.43, 603.55, 560.84, 482.39, 456.42, 550.68, 526.83, 577.16, 515.7, 450.44, 522.18, 546.44, 530.86, 452.47, 498.56, 486.58, 523.58, 424.25, 587.53, 533.11, 477.74, 582.16, 449.59, 575.78, 523.92, 475.5, 556.5, 487.27, 515.98, 523.78, 528.1, 548.19, 484.26, 542.97, 540.72, 475.16, 483.19, 598.89, 419.74, 448.57, 494.05, 438.82, 460.1, 343.01, 525.2, 527.51, 461.07, 557.52, 577.24, 499.41, 431.83, 487.61, 412.54, 454.56, 471.44, 520.61, 519.03, 547.59, 541.78, 507.67, 448.96, 468.08, 494.02, 520.78, 442.87, 507.98, 553.78, 486.46, 476.9, 546.92, 502.69, 557.93, 445.11, 501.94, 491.04, 534.49, 533.16, 543.76, 484.38, 610.28, 528.18, 483.56, 509.4, 496.62, 439.98, 488.11, 475.01, 514.07, 567.83, 506.74, 496.28, 417.83, 499.35, 556.75, 511, 596.06, 537.87, 562.97, 496.55, 499.85, 460.23, 478.96, 451.44, 576.34, 466.04, 433.66, 530.26, 554.76, 469.11, 477.79, 542.85, 582.55, 464.29, 458.92, 585.33, 487.18, 576.68, 488.35, 441.12, 509.81, 464.99, 464, 506.53, 459.36, 554.13, 444, 436.21, 528.15, 480.87, 541.93, 496.3, 423.1, 546.8, 499.21, 543.36, 534.85, 523.89, 524.99, 522.67, 524.51, 502.6)
)
str(dailyDT)
## Classes 'data.table' and 'data.frame': 153 obs. of 2 variables:
## $ timestamp: POSIXct, format: "2018-08-01" "2018-08-02" ...
## $ sales : num 483 449 524 498 448 ...
## - attr(*, ".internal.selfref")=<externalptr>
# Sort by time
setorderv(dailyDT, "timestamp")
# 1-day lag
dailyDT[, sales_lag1 := shift(sales, type = "lag", n = 1)]
# 5-day lag
dailyDT[, sales_lag5 := shift(sales, type = "lag", n = 5)]
experimentDT <- data.table(day=c(1:3, 1:3),
result=c(1, 3.3, 2.5, 1.1, 3.9, 3.8),
subject_id=LETTERS[c(1, 1, 1, 2, 2, 2)]
)
experimentDT
## day result subject_id
## 1: 1 1.0 A
## 2: 2 3.3 A
## 3: 3 2.5 A
## 4: 1 1.1 B
## 5: 2 3.9 B
## 6: 3 3.8 B
# Yesterday
experimentDT[, yesterday := shift(result, type="lag", n=1), by=subject_id]
# Two days ago
experimentDT[, two_days_ago := shift(result, type="lag", n=2), by=subject_id]
# Preview experimentDT
print(experimentDT)
## day result subject_id yesterday two_days_ago
## 1: 1 1.0 A NA NA
## 2: 2 3.3 A 1.0 NA
## 3: 3 2.5 A 3.3 1.0
## 4: 1 1.1 B NA NA
## 5: 2 3.9 B 1.1 NA
## 6: 3 3.8 B 3.9 1.1
aluminumDT <- as.data.table(feather::read_feather("./RInputFiles/Feather_Data/aluminumDF.feather") %>%
rename(timestamp=Date, price=`Cash Buyer`) %>%
select(timestamp, price)
)
str(aluminumDT)
## Classes 'data.table' and 'data.frame': 1552 obs. of 2 variables:
## $ timestamp: Date, format: "2018-03-12" "2018-03-09" ...
## $ price : num 2096 2078 2082 2112 2136 ...
## - attr(*, ".internal.selfref")=<externalptr>
# Add 1-period and 2-period lags
aluminumDT[, lag1 := shift(price, type = "lag", n = 1)]
aluminumDT[, lag2 := shift(price, type = "lag", n = 2)]
# Fit models with 1 and 2 lags
mod1 <- lm(price ~ lag1, data = aluminumDT)
mod2 <- lm(price ~ lag1 + lag2, data = aluminumDT)
# Compare
stargazer::stargazer(list(mod1, mod2), type = "text")
##
## ==============================================================================
## Dependent variable:
## ----------------------------------------------------------
## price
## (1) (2)
## ------------------------------------------------------------------------------
## lag1 0.994*** 1.013***
## (0.003) (0.025)
##
## lag2 -0.019
## (0.025)
##
## Constant 10.691** 10.753**
## (4.934) (4.944)
##
## ------------------------------------------------------------------------------
## Observations 1,551 1,550
## R2 0.989 0.989
## Adjusted R2 0.989 0.989
## Residual Std. Error 20.854 (df = 1549) 20.860 (df = 1547)
## F Statistic 138,844.900*** (df = 1; 1549) 69,320.790*** (df = 2; 1547)
## ==============================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
# One-period lag
dailyDT[, sales_lag1 := shift(sales, type = "lag", n = 1)]
# One-period diff
dailyDT[, sales_diff1 := sales - sales_lag1]
# Two-period diff
dailyDT[, sales_diff2 := sales - shift(sales, type="lag", n=2)]
# Add 1-period percentage change
dailyDT[, sales_pctchng1 := sales_diff1 / sales_lag1]
# Add 2-period percentage change
dailyDT[, sales_pctchng2 := (sales / shift(sales, type="lag", n=2) - 1)]
passengerDT <- data.table(obs_time=lubridate::ymd_hms("2017-08-01 00:00:00") + lubridate::minutes(15*0:96),
passengers=c(506, 513, 554, 427, 439, 476, 509, 382, 457, 498, 398, 385, 529, 442, 393, 500, 557, 439, 453, 488, 520, 546, 542, 492, 528, 493, 498, 530, 515, 537, 535, 518, 396, 623, 499, 467, 523, 499, 535, 383, 546, 552, 436, 556, 452, 512, 514, 476, 437, 432, 522, 492, 537, 480, 543, 485, 491, 512, 555, 498, 452, 502, 514, 452, 446, 458, 538, 414, 499, 433, 503, 466, 553, 473, 473, 546, 447, 545, 492, 554, 466, 618, 530, 568, 541, 433, 524, 433, 571, 506, 485, 466, 490, 467, 528, 427, 480)
)
str(passengerDT)
## Classes 'data.table' and 'data.frame': 97 obs. of 2 variables:
## $ obs_time : POSIXct, format: "2017-08-01 00:00:00" "2017-08-01 00:15:00" ...
## $ passengers: num 506 513 554 427 439 476 509 382 457 498 ...
## - attr(*, ".internal.selfref")=<externalptr>
# Generation time in seconds
passengerDT[, obs_time_in_seconds := as.numeric(obs_time)]
# Add floor time for each time stamp
seconds_in_an_hour <- 60 * 60
passengerDT[, hour_end := floor(obs_time_in_seconds / seconds_in_an_hour)]
# Count number of observations in each hour
passengerDT[, .N, by = hour_end]
## hour_end N
## 1: 417096 4
## 2: 417097 4
## 3: 417098 4
## 4: 417099 4
## 5: 417100 4
## 6: 417101 4
## 7: 417102 4
## 8: 417103 4
## 9: 417104 4
## 10: 417105 4
## 11: 417106 4
## 12: 417107 4
## 13: 417108 4
## 14: 417109 4
## 15: 417110 4
## 16: 417111 4
## 17: 417112 4
## 18: 417113 4
## 19: 417114 4
## 20: 417115 4
## 21: 417116 4
## 22: 417117 4
## 23: 417118 4
## 24: 417119 4
## 25: 417120 1
## hour_end N
# Mean passengers per hour
passengerDT[, mean(passengers), by=hour_end]
## hour_end V1
## 1: 417096 500.00
## 2: 417097 451.50
## 3: 417098 434.50
## 4: 417099 466.00
## 5: 417100 484.25
## 6: 417101 525.00
## 7: 417102 512.25
## 8: 417103 526.25
## 9: 417104 496.25
## 10: 417105 485.00
## 11: 417106 522.50
## 12: 417107 488.50
## 13: 417108 470.75
## 14: 417109 511.25
## 15: 417110 514.00
## 16: 417111 480.00
## 17: 417112 464.00
## 18: 417113 475.25
## 19: 417114 511.25
## 20: 417115 509.50
## 21: 417116 545.50
## 22: 417117 482.75
## 23: 417118 507.00
## 24: 417119 478.00
## 25: 417120 480.00
## hour_end V1
# Cleaner names
passengerDT[, .(mean_passengers = mean(passengers)), by=hour_end]
## hour_end mean_passengers
## 1: 417096 500.00
## 2: 417097 451.50
## 3: 417098 434.50
## 4: 417099 466.00
## 5: 417100 484.25
## 6: 417101 525.00
## 7: 417102 512.25
## 8: 417103 526.25
## 9: 417104 496.25
## 10: 417105 485.00
## 11: 417106 522.50
## 12: 417107 488.50
## 13: 417108 470.75
## 14: 417109 511.25
## 15: 417110 514.00
## 16: 417111 480.00
## 17: 417112 464.00
## 18: 417113 475.25
## 19: 417114 511.25
## 20: 417115 509.50
## 21: 417116 545.50
## 22: 417117 482.75
## 23: 417118 507.00
## 24: 417119 478.00
## 25: 417120 480.00
## hour_end mean_passengers
# Generate hourly summary statistics
passengerDT[, .(min_passengers = min(passengers), max_passengers = max(passengers)), by=hour_end]
## hour_end min_passengers max_passengers
## 1: 417096 427 554
## 2: 417097 382 509
## 3: 417098 385 498
## 4: 417099 393 529
## 5: 417100 439 557
## 6: 417101 492 546
## 7: 417102 493 530
## 8: 417103 515 537
## 9: 417104 396 623
## 10: 417105 383 535
## 11: 417106 436 556
## 12: 417107 452 514
## 13: 417108 432 522
## 14: 417109 480 543
## 15: 417110 491 555
## 16: 417111 452 514
## 17: 417112 414 538
## 18: 417113 433 503
## 19: 417114 473 553
## 20: 417115 447 554
## 21: 417116 466 618
## 22: 417117 433 541
## 23: 417118 466 571
## 24: 417119 427 528
## 25: 417120 480 480
## hour_end min_passengers max_passengers
Chapter 4 - Case Study: Financial Data
Modeling Metals Prices:
copperDT[, .(price = `Cash Seller & Settlement`)] # The .() allows for both selecting by name and changing names
Time Series Feature Engineering:
EDA and Model Building:
Wrap Up:
Example code includes:
copperDT <- as.data.table(feather::read_feather("./RInputFiles/Feather_Data/copperDF.feather"))
str(copperDT)
## Classes 'data.table' and 'data.frame': 1546 obs. of 13 variables:
## $ Date : Date, format: "2018-03-12" "2018-03-09" ...
## $ Cash Buyer : num 6856 6806 6828 6872 6968 ...
## $ Cash Seller & Settlement: num 6857 6808 6830 6873 6968 ...
## $ 3-months Buyer : num 6895 6838 6855 6915 7003 ...
## $ 3-months Seller : num 6900 6839 6860 6916 7004 ...
## $ 15-months Buyer : num NA NA NA NA NA NA NA NA NA NA ...
## $ 15-months Seller : num NA NA NA NA NA NA NA NA NA NA ...
## $ Dec 1 Buyer : num 7020 6965 6975 7025 7100 ...
## $ Dec 1 Seller : num 7030 6975 6985 7035 7110 ...
## $ Dec 2 Buyer : num 7025 6975 6975 7020 7095 ...
## $ Dec 2 Seller : num 7035 6985 6985 7030 7105 ...
## $ Dec 3 Buyer : num 7005 6955 6955 7000 7075 ...
## $ Dec 3 Seller : num 7015 6965 6965 7010 7085 ...
## - attr(*, ".internal.selfref")=<externalptr>
nickelDT <- as.data.table(feather::read_feather("./RInputFiles/Feather_Data/nickelDF.feather"))
str(nickelDT)
## Classes 'data.table' and 'data.frame': 1552 obs. of 13 variables:
## $ Date : Date, format: "2018-03-12" "2018-03-09" ...
## $ Cash Buyer : num 13720 13335 13240 13350 13575 ...
## $ Cash Seller & Settlement: num 13725 13345 13250 13370 13580 ...
## $ 3-months Buyer : num 13750 13385 13295 13400 13575 ...
## $ 3-months Seller : num 13800 13390 13300 13420 13580 ...
## $ 15-months Buyer : num NA NA NA NA NA NA NA NA NA NA ...
## $ 15-months Seller : num NA NA NA NA NA NA NA NA NA NA ...
## $ Dec 1 Buyer : num 14035 13655 13570 13670 13840 ...
## $ Dec 1 Seller : num 14085 13705 13620 13720 13890 ...
## $ Dec 2 Buyer : num 14190 13810 13730 13830 14000 ...
## $ Dec 2 Seller : num 14240 13860 13780 13880 14050 ...
## $ Dec 3 Buyer : num 14315 13935 13855 13955 14125 ...
## $ Dec 3 Seller : num 14365 13985 13905 14005 14175 ...
## - attr(*, ".internal.selfref")=<externalptr>
cobaltDT <- as.data.table(feather::read_feather("./RInputFiles/Feather_Data/cobaltDF.feather"))
str(cobaltDT)
## Classes 'data.table' and 'data.frame': 1551 obs. of 7 variables:
## $ Date : Date, format: "2018-03-12" "2018-03-09" ...
## $ Cash Buyer : num 18000 17500 17000 16500 16000 15500 15500 15500 15500 15500 ...
## $ Cash Seller & Settlement: num 18500 19500 19000 18500 18000 16000 16000 16000 16000 16000 ...
## $ 3-months Buyer : num 18000 17500 17000 16500 16000 15500 15500 15500 15500 15500 ...
## $ 3-months Seller : num 18500 19500 19000 18500 18000 16000 16000 16000 16000 16000 ...
## $ 15-months Buyer : num 18235 18490 17990 17500 17000 ...
## $ 15-months Seller : num 19235 19490 18990 18500 18000 ...
## - attr(*, ".internal.selfref")=<externalptr>
tinDT <- as.data.table(feather::read_feather("./RInputFiles/Feather_Data/tinDF.feather"))
str(tinDT)
## Classes 'data.table' and 'data.frame': 1552 obs. of 13 variables:
## $ Date : Date, format: "2018-03-12" "2018-03-09" ...
## $ Cash Buyer : num 21475 21325 21625 21490 21585 ...
## $ Cash Seller & Settlement: num 21500 21375 21650 21495 21595 ...
## $ 3-months Buyer : num 21355 21255 21525 21335 21450 ...
## $ 3-months Seller : num 21360 21260 21550 21340 21455 ...
## $ 15-months Buyer : num 21050 20960 21240 21030 21145 ...
## $ 15-months Seller : num 21100 21010 21290 21080 21195 ...
## $ Dec 1 Buyer : num NA NA NA NA NA NA NA NA NA NA ...
## $ Dec 1 Seller : num NA NA NA NA NA NA NA NA NA NA ...
## $ Dec 2 Buyer : num NA NA NA NA NA NA NA NA NA NA ...
## $ Dec 2 Seller : num NA NA NA NA NA NA NA NA NA NA ...
## $ Dec 3 Buyer : num NA NA NA NA NA NA NA NA NA NA ...
## $ Dec 3 Seller : num NA NA NA NA NA NA NA NA NA NA ...
## - attr(*, ".internal.selfref")=<externalptr>
aluminumDT <- as.data.table(feather::read_feather("./RInputFiles/Feather_Data/aluminumDF.feather"))
str(aluminumDT)
## Classes 'data.table' and 'data.frame': 1552 obs. of 13 variables:
## $ Date : Date, format: "2018-03-12" "2018-03-09" ...
## $ Cash Buyer : num 2096 2078 2082 2112 2136 ...
## $ Cash Seller & Settlement: num 2097 2078 2082 2112 2136 ...
## $ 3-months Buyer : num 2117 2098 2104 2132 2154 ...
## $ 3-months Seller : num 2118 2099 2104 2132 2155 ...
## $ 15-months Buyer : num NA NA NA NA NA NA NA NA NA NA ...
## $ 15-months Seller : num NA NA NA NA NA NA NA NA NA NA ...
## $ Dec 1 Buyer : num 2168 2148 2150 2177 2192 ...
## $ Dec 1 Seller : num 2173 2153 2155 2182 2197 ...
## $ Dec 2 Buyer : num 2188 2168 2172 2195 2210 ...
## $ Dec 2 Seller : num 2193 2173 2177 2200 2215 ...
## $ Dec 3 Buyer : num 2208 2188 2192 2215 2230 ...
## $ Dec 3 Seller : num 2213 2193 2197 2220 2235 ...
## - attr(*, ".internal.selfref")=<externalptr>
# Rename "Cash Buyer" to "copper_price"
setnames(copperDT, old="Cash Buyer", new="copper_price")
setnames(cobaltDT, old="Cash Buyer", new="cobalt_price")
setnames(tinDT, old="Cash Buyer", new="tin_price")
setnames(aluminumDT, old="Cash Buyer", new="aluminum_price")
# Convert `"Date"` to POSIXct
copperDT[, close_date := as.POSIXct(Date, tz="UTC")]
cobaltDT[, close_date := as.POSIXct(Date, tz="UTC")]
tinDT[, close_date := as.POSIXct(Date, tz="UTC")]
aluminumDT[, close_date := as.POSIXct(Date, tz="UTC")]
# Create copperDT2 with "close_date" and "copper_price"
copperDT2 <- copperDT[, .(close_date, copper_price)]
cobaltDT2 <- cobaltDT[, .(close_date, cobalt_price)]
tinDT2 <- tinDT[, .(close_date, tin_price)]
aluminumDT2 <- aluminumDT[, .(close_date, aluminum_price)]
# Create a new data.table using .() subsetting
nickelDT2 <- nickelDT[, .(
close_date = as.POSIXct(Date, tz = "UTC"),
nickel_price = `Cash Buyer`
)]
str(nickelDT2)
## Classes 'data.table' and 'data.frame': 1552 obs. of 2 variables:
## $ close_date : POSIXct, format: "2018-03-11 19:00:00" "2018-03-08 18:00:00" ...
## $ nickel_price: num 13720 13335 13240 13350 13575 ...
## - attr(*, ".internal.selfref")=<externalptr>
# Merge copperDT and cobaltDT with merge()
mergedDT <- merge(cobaltDT2, copperDT2, by="close_date", all=TRUE)
# Merge five tables into one
mergedDT <- Reduce(f = function(x, y) { merge(x, y, by="close_date", all=TRUE) },
x = list(aluminumDT2, copperDT2, cobaltDT2, nickelDT2, tinDT2)
)
# Function to add differences
add_diffs <- function(DT, cols, ndiff){
for (colname in cols){
new_name <- paste0(colname, "_diff", ndiff)
DT[, (new_name) := get(colname) - shift(get(colname), type = "lag", n = ndiff)]
}
}
# Add 2-period diffs
add_diffs(mergedDT, paste0(c("aluminum", "cobalt", "copper", "nickel", "tin"), "_price"), 2)
# Function to add growth rates
# Add lagged growth rates (percentage changes) to a data.table, by reference.
#
# For each column name in `cols`, creates a new column named
# "<col>_pctchg<ndiff>" holding (current / value `ndiff` rows earlier) - 1;
# the first `ndiff` rows of each new column are NA.
# DT is assumed to already be sorted in time order.
#
# Args:
#   DT:    a data.table (modified in place via `:=`).
#   cols:  character vector of column names to compute growth rates for.
#   ndiff: number of rows to lag when computing the growth rate.
#
# Returns: DT, invisibly, so calls can be chained.
add_growth_rates <- function(DT, cols, ndiff){
  missing_cols <- setdiff(cols, names(DT))
  if (length(missing_cols) > 0) {
    stop("Columns not found in DT: ", paste(missing_cols, collapse = ", "),
         call. = FALSE)
  }
  for (colname in cols){
    new_name <- paste0(colname, "_pctchg", ndiff)
    # Wrapping the LHS in () makes := use the *value* of new_name as the
    # column name instead of creating a column literally called "new_name".
    DT[, (new_name) := (get(colname) / shift(get(colname), type = "lag", n = ndiff)) - 1]
  }
  invisible(DT)
}
# Add 1-period growth rate
add_growth_rates(mergedDT, paste0(c("aluminum", "cobalt", "copper", "nickel", "tin"), "_price"), 1)
# Function to get correlation matrix from a data.table
# Correlation matrix for a chosen set of columns in a data.table.
# Rows containing any NA in the selected columns are dropped first, so the
# result is the complete-case correlation matrix of those columns.
corrmat_from_dt <- function(DT, cols){
  # `..cols` tells data.table to look up `cols` in the calling scope
  # rather than treating it as a column name.
  selected <- DT[, ..cols]
  # Drop every row with at least one missing value before correlating.
  complete_rows <- na.omit(selected)
  cor(complete_rows)
}
# Get correlations of prices
corrmat_from_dt(mergedDT, paste0(c("aluminum", "cobalt", "copper", "nickel", "tin"), "_price"))
## aluminum_price cobalt_price copper_price nickel_price tin_price
## aluminum_price 1.0000000 0.5325989 0.7538137 0.5834202 0.6123461
## cobalt_price 0.5325989 1.0000000 0.8101197 0.9201932 0.6821939
## copper_price 0.7538137 0.8101197 1.0000000 0.8442838 0.7297888
## nickel_price 0.5834202 0.9201932 0.8442838 1.0000000 0.6538355
## tin_price 0.6123461 0.6821939 0.7297888 0.6538355 1.0000000
# Add 1-period first differences
price_cols <- c("aluminum_price", "cobalt_price", "copper_price", "nickel_price", "tin_price")
add_diffs(DT = mergedDT, cols = price_cols, ndiff = 1)
# Rename aluminum first difference to "target"
setnames(mergedDT, "aluminum_price_diff1", "target")
# Add 1-period growth rates
add_growth_rates(DT = mergedDT, cols = price_cols, ndiff = 1)
# Correlation matrix
diff_cols <- grep("_diff", x = names(mergedDT), value = TRUE)
growth_cols <- grep("_pctchg", x = names(mergedDT), value = TRUE)
corrmat_from_dt(DT = mergedDT, cols = c(diff_cols, growth_cols, "target"))[, "target"]
## aluminum_price_diff2 cobalt_price_diff2 copper_price_diff2
## 0.712321999 0.005595258 0.396601022
## nickel_price_diff2 tin_price_diff2 cobalt_price_diff1
## 0.323119963 0.248714962 0.001459352
## copper_price_diff1 nickel_price_diff1 tin_price_diff1
## 0.579055815 0.462822212 0.377950436
## aluminum_price_pctchg1 cobalt_price_pctchg1 copper_price_pctchg1
## 0.993585007 -0.012106589 0.562282577
## nickel_price_pctchg1 tin_price_pctchg1 target
## 0.449428280 0.359260474 1.000000000
# Add 1-period differences
add_diffs(mergedDT, paste0(c("cobalt", "copper", "nickel", "tin"), "_price"), 1)
# Add 3-period growth rates
add_growth_rates(mergedDT, paste0(c("cobalt", "copper", "nickel", "tin"), "_price"), 3)
# Add 12-period difference in nickel price
add_diffs(mergedDT, paste0(c("nickel"), "_price"), 12)
# Top 4 difference / growth columns
top_features <- c("copper_price_diff1", "nickel_price_diff1", "tin_price_diff1", "copper_price_pctchg3")
Chapter 1 - Introduction to plotly
What is plotly?
Univariate Graphics:
count(Type) %>%                  # create a frequency table
  plot_ly(x = ~Type, y = ~n) %>% # specify aesthetics (similar to ggplot aes() function)
  add_bars()                     # add the bars trace
plot_ly(x = ~Phenols) %>%        # specify aesthetics
  add_histogram()                # add the histogram trace
Bivariate Graphics:
count(type, quality_label) %>% group_by(type) %>% # group the table mutate(prop = n / sum(n)) %>% # calculate the proportions plot_ly(x = ~type, y = ~n, color = ~quality_label) %>% add_bars() %>% layout(barmode = "stack") Example code includes:
vgsales <- readr::read_csv("./RInputFiles/vgsales.csv")
glimpse(vgsales)
# Store the scatterplot of Critic_Score vs. NA_Sales sales in 2016
scatter <- vgsales %>%
filter(Year == 2016) %>%
ggplot(aes(x = NA_Sales, y = Critic_Score)) +
geom_point(alpha = 0.3)
# Convert the scatterplot to a plotly graphic
plotly::ggplotly(scatter)
library(plotly)
# Create a histogram of Critic_Score
vgsales %>%
filter(!is.na(Critic_Score)) %>%
plot_ly(x = ~Critic_Score) %>%
add_histogram()
# Create a histogram of Critic_Score with at most 25 bins
vgsales %>%
filter(!is.na(Critic_Score)) %>%
plot_ly(x = ~Critic_Score) %>%
add_histogram(nbinsx = 25)
# Create a histogram with bins of width 10 between 0 and 100
vgsales %>%
filter(!is.na(Critic_Score)) %>%
plot_ly(x = ~Critic_Score) %>%
add_histogram(xbins = list(start=0, end=100, size=10))
# Create a frequency for Genre
genre_table <- vgsales %>%
count(Genre)
# Reorder the bars for Genre by n
genre_table %>%
filter(!is.na(Genre)) %>%
mutate(Genre = fct_reorder(Genre, n, .desc=TRUE)) %>%
plot_ly(x = ~Genre, y = ~n) %>%
add_bars()
# Create a scatter plot of User_Score against Critic_Score
vgsales %>%
filter(!is.na(Critic_Score) & !is.na(User_Score)) %>%
plot_ly(x=~Critic_Score, y=~User_Score) %>%
add_markers()
# Filter out the 2016 video games
vg2016 <- vgsales %>%
filter(Year == 2016)
# Create a stacked bar chart of Rating by Genre
vg2016 %>%
count(Genre, Rating) %>%
plot_ly(x = ~Genre, y = ~n, color = ~Rating) %>%
add_bars() %>%
layout(barmode = "stack")
# Create boxplots of Global_Sales by Genre for above data
vg2016 %>%
plot_ly(x=~Global_Sales, y=~Genre) %>%
add_boxplot()
Chapter 2 - Styling and Customizing Graphics
Customize Traces:
Thoughtful Use of Color:
Hover Info:
count(Type) %>% plot_ly(x = ~Type, y = ~n, hoverinfo = "y") %>% add_bars() Customizing Layout:
Example code includes:
# Filter out the 2016 video games
vgsales2016 <- vgsales %>%
filter(Year == 2016)
str(vgsales2016)
# Create a histogram of Critic_Score with navy bars that are 50% transparent
vgsales2016 %>%
filter(!is.na(Critic_Score)) %>%
plot_ly(x = ~Critic_Score) %>%
add_histogram(color = I("navy"), opacity = 0.5)
# Change the color of the histogram using a hex code
vgsales2016 %>%
filter(!is.na(Critic_Score)) %>%
plot_ly(x = ~Critic_Score) %>%
add_histogram(color=I("#111e6c"))
# Change the color of the histogram using rgb()
vgsales2016 %>%
filter(!is.na(Critic_Score)) %>%
plot_ly(x = ~Critic_Score) %>%
add_histogram(marker = list(color = "rgb(17, 30, 108)"))
# Set the plotting symbol to diamond and the size to 4
vgsales2016 %>%
filter(!is.na(Critic_Score), !is.na(User_Score)) %>%
plot_ly(x = ~User_Score, y = ~Critic_Score) %>%
add_markers(marker = list(symbol="diamond", size=4))
# Use color to add Genre as a third variable
vgsales2016 %>%
filter(!is.na(Critic_Score), !is.na(User_Score), !is.na(Genre)) %>%
plot_ly(x=~Critic_Score, y=~User_Score, color=~Genre) %>%
add_markers(colors="Dark2")
# Create a scatterplot of User_Score against Critic_Score coded by Rating
vgsales2016 %>%
filter(!is.na(Critic_Score), !is.na(User_Score), !is.na(Rating)) %>%
plot_ly(x=~Critic_Score, y=~User_Score, symbol=~Rating) %>%
add_markers()
# Create a scatterplot of User_Score vs. Critic_Score colored by User_Count
vgsales2016 %>%
filter(!is.na(Critic_Score), !is.na(User_Score), !is.na(User_Count)) %>%
plot_ly(x = ~Critic_Score, y = ~User_Score, color=~User_Count) %>%
add_markers()
# Create a scatterplot of User_Score vs. Critic_Score colored by log User_Count
vgsales2016 %>%
filter(!is.na(Critic_Score), !is.na(User_Score), !is.na(User_Count)) %>%
plot_ly(x = ~Critic_Score, y = ~User_Score, color=~log(User_Count)) %>%
add_markers()
# Create a bar chart of Platform with hoverinfo only for the bar heights
vgsales2016 %>%
filter(!is.na(Platform)) %>%
count(Platform) %>%
plot_ly(x=~Platform, y=~n, hoverinfo="y") %>%
add_bars()
# Create a scatterplot of User_Score vs. Critic score
vgsales2016 %>%
filter(!is.na(Critic_Score), !is.na(User_Score), !is.na(Name)) %>%
# Add video game Name to the hover info text
plot_ly(x=~Critic_Score, y=~User_Score, hoverinfo="text", text=~Name) %>%
add_markers()
# Format the hover info for NA_Sales, EU_Sales, and Name
vgsales2016 %>%
filter(!is.na(NA_Sales), !is.na(EU_Sales), !is.na(Name)) %>%
plot_ly(x = ~NA_Sales, y = ~EU_Sales, hoverinfo = "text",
text = ~paste("NA_Sales:", NA_Sales, "<br>", "EU_Sales:", EU_Sales, "<br>", "Name:", Name)
) %>%
add_markers()
# Polish the scatterplot by transforming the x-axis and labeling both axes
vgsales2016 %>%
filter(!is.na(Global_Sales), !is.na(Critic_Score)) %>%
plot_ly(x = ~Global_Sales, y = ~Critic_Score) %>%
add_markers(marker = list(opacity = 0.5)) %>%
layout(xaxis = list(title="Global sales (millions of units)", type="log"),
yaxis = list(title="Critic score")
)
# Set the background color to #ebebeb and remove the vertical grid
vgsales %>%
filter(!is.na(Year)) %>%
group_by(Year) %>%
summarize(Global_Sales = sum(Global_Sales, na.rm=TRUE)) %>%
plot_ly(x = ~Year, y = ~Global_Sales) %>%
add_lines() %>%
layout(xaxis=list(showgrid=FALSE), paper_bgcolor="#ebebeb")
Chapter 3 - Advanced Charts
Layering Traces:
add_lines(x = ~density1$x, y = ~density1$y, name = "Type 1") %>% add_lines(x = ~density2$x, y = ~density2$y, name = "Type 2") %>% add_lines(x = ~density3$x, y = ~density3$y, name = "Type 3") %>% layout(xaxis = list(title = 'Flavonoids'), yaxis = list(title = 'Density')) Subplots:
group_by(region) %>% do(plot = plot_ly(data = ., x = ~Critic_Score, y = ~User_Score) %>% add_markers(name = ~Genre)) %>% subplot(nrows = 2) Scatterplot Matrices:
Binned Scatterplots:
Example code includes:
vgsales2016 <- vgsales %>%
mutate(User_Score = as.numeric(User_Score)) %>%
filter(Year == 2016, !is.na(User_Score), !is.na(Critic_Score))
str(vgsales2016)
# Fit the regression model of User_Score on Critic_Score
m <- lm(User_Score ~ Critic_Score, data = vgsales2016)
# Create the scatterplot with smoother
vgsales2016 %>%
select(User_Score, Critic_Score) %>%
na.omit() %>%
plot_ly(x = ~Critic_Score, y = ~User_Score) %>%
add_markers(showlegend = FALSE) %>%
add_lines(y = ~fitted(m))
activision <- vgsales2016 %>% filter(Publisher == "Activision")
ea <- vgsales2016 %>% filter(Publisher == "Electronic Arts")
nintendo <- vgsales2016 %>% filter(Publisher == "Nintendo")
# Compute density curves
d.a <- density(activision$Critic_Score, na.rm = TRUE)
d.e <- density(ea$Critic_Score, na.rm = TRUE)
d.n <- density(nintendo$Critic_Score, na.rm = TRUE)
# Overlay density plots
plot_ly() %>%
add_lines(x = ~d.a$x, y = ~d.a$y, name = "Activision", fill = 'tozeroy') %>%
add_lines(x = ~d.e$x, y = ~d.e$y, name = "Electronic Arts", fill = 'tozeroy') %>%
add_lines(x = ~d.n$x, y = ~d.n$y, name = "Nintendo", fill = 'tozeroy') %>%
layout(xaxis = list(title = 'Critic Score'),
yaxis = list(title = 'Density'))
# Create a scatterplot of User_Score against Critic_Score for PS4 games
p1 <- vgsales2016 %>%
filter(Platform == "PS4") %>%
plot_ly(x = ~Critic_Score, y = ~User_Score) %>%
add_markers(name = "PS4")
# Create a scatterplot of User_Score against Critic_Score for XOne games
p2 <- vgsales2016 %>%
filter(Platform == "XOne") %>%
plot_ly(x = ~Critic_Score, y = ~User_Score) %>%
add_markers(name = "XOne")
# Create a faceted scatterplot containing p1 and p2
subplot(p1, p2, nrows=2)
# Create a faceted scatterplot of User_Score vs. Critic_Score with 3 rows
vgsales2016 %>%
group_by(Platform) %>%
do(plot = plot_ly(data = ., x=~Critic_Score, y=~User_Score) %>%
add_markers(name = ~Platform)
) %>%
subplot(nrows = 3, shareY = TRUE, shareX = TRUE)
# Add x-axis and y-axis labels, and a title
sp2 <- subplot(p1, p2, nrows = 2, shareX=TRUE, shareY=TRUE) %>%
layout(title="User score vs. critic score by platform, 2016")
sp2
# Add x-axis and y-axis labels, and a title to sp2
sp2 %>%
layout(xaxis = list(title=""), xaxis2 = list(title="Year"),
yaxis = list(title="Global Sales (M units)"), yaxis2 = list(title="Global Sales (M units)")
)
# Create a SPLOM of NA_Sales, EU_Sales, and JP_Sales
vgsales2016 %>%
plot_ly() %>%
add_trace(type = "splom", dimensions = list(list(label = "N. America", values = ~NA_Sales),
list(label = "Europe", values = ~EU_Sales),
list(label = "Japan", values = ~JP_Sales)
)
)
# Color the SPLOM of NA_Sales, EU_Sales, and JP_Sales by nintendo
vgsales2016 %>%
mutate(nintendo = ifelse(Publisher == "Nintendo", "Nintendo", "Other")) %>%
plot_ly(color=~nintendo) %>%
add_trace(type="splom", dimensions = list(list(label = "N. America", values = ~NA_Sales),
list(label = "Europe", values = ~EU_Sales),
list(label = "Japan", values = ~JP_Sales)
)
)
# Delete the diagonal plots in splom
splom %>%
style(diagonal = list(visible=FALSE))
# Delete the plots in the upper half of splom
splom %>%
style(showupperhalf=FALSE)
# Delete the plots in the lower half of splom
splom %>%
style(showlowerhalf=FALSE)
# Create a binned scatterplot of User_Score vs. Critic_Score
vgsales %>%
plot_ly(x=~Critic_Score, y=~User_Score) %>%
add_histogram2d(nbinsx=50, nbinsy=50)
Chapter 4 - Case Study
Introduction to 2018 Election Data:
Choropleth Maps:
plot_geo(locationmode = 'USA-states') %>% add_trace(z = ~turnout, locations = ~state.abbr) %>% layout(geo = list(scope = 'usa')) # restricts map only to USA From Polygons to Maps:
group_by(group) %>% plot_ly( x=~long, y=~lat, color=~turnout2018, split=~region ) %>% add_polygons(line = list(width = 0.4), showlegend = FALSE) Wrap Up:
Example code includes:
turnout <- readr::read_csv("./RInputFiles/TurnoutRates.csv")
str(turnout)
# Create a scatterplot of turnout2018 against turnout2014
p <- turnout %>%
plot_ly(x=~turnout2014, y=~turnout2018) %>%
add_markers() %>%
layout(xaxis = list(title="2014 voter turnout"),
yaxis = list(title="2018 voter turnout")
)
p
# Add the line y = x to the scatterplot
p %>%
add_lines(x = c(0.25, 0.6), y = c(0.25, 0.6)) %>%
layout(showlegend=FALSE)
# Dotplot of the 15 states with the highest 2018 voter turnout, ordered by
# turnout. Fixes the axis-label typo "Elgible" -> "Eligible".
turnout %>%
  top_n(15, wt = turnout2018) %>%
  plot_ly(x = ~turnout2018, y = ~fct_reorder(state, turnout2018)) %>%
  add_markers() %>%
  layout(yaxis = list(title = "State", type = "category"),
         xaxis = list(title = "Eligible voter turnout"))
fundraising <- readr::read_csv("./RInputFiles/fec_candidate_summary_2018.csv")
str(fundraising)
# Create a histogram of receipts for the senate races
fundraising %>%
filter(office=="S") %>%
plot_ly(x=~receipts) %>%
add_histogram() %>%
layout(title="Fundraising for 2018 Senate races", xaxis=list(title="Total contributions received"))
# Create a dotplot of the top 15 Senate campaigns
fundraising %>%
filter(office == "S") %>%
top_n(15, wt = receipts) %>%
plot_ly(x = ~receipts, y = ~fct_reorder(state, receipts), color = ~fct_drop(party),
hoverinfo = "text", text = ~paste("Candidate:", name, "<br>", "Party:", party, "<br>",
"Receipts:", receipts, "<br>",
"Disbursements:", disbursement
)
) %>%
add_markers(colors = c("blue", "red"))
# Create a choropleth map of the change in voter turnout from 2014 to 2018
turnout %>%
mutate(change = turnout2018 - turnout2014) %>%
plot_geo(locationmode = 'USA-states') %>%
add_trace(z=~change, locations=~state.abbr) %>%
layout(geo = list(scope="usa"))
senate_winners <- readr::read_csv("./RInputFiles/senate_winners.csv")
str(senate_winners)
# Create a choropleth map displaying the Senate results
senate_winners %>%
plot_geo(locationmode = "USA-states") %>%
add_trace(z=~as.numeric(as.factor(party)), locations=~state,
colors = c("dodgerblue", "mediumseagreen", "tomato"),
hoverinfo = "text", text = ~paste("Candidate:", name, "<br>",
"Party:", party, "<br>",
"% vote:", round(pct.vote, 1)
)
) %>%
layout(geo = list(scope = 'usa')) %>%
hide_colorbar()
# Map President Trump's rallies in 2018
# rallies2018 %>%
# plot_geo(locationmode = 'USA-states') %>%
# add_markers(x=~long, y=~lat, size=~no.speakers,
# hoverinfo = "text", text = ~paste(city, state, sep = ",")
# ) %>%
# layout(title = "2018 Trump Rallies", geo = list(scope = "usa"))
# Customize the geo layout
g <- list(scope = 'usa',
showland = TRUE, landcolor = "gray90",
showlakes = TRUE, lakecolor = "white",
showsubunit = TRUE, subunitcolor = "white"
)
# Apply the geo layout to the map
# rallies2018 %>%
# plot_geo(locationmode = 'USA-states') %>%
# add_markers(x = ~long, y = ~lat, size = ~no.speakers,
# hoverinfo = "text", text = ~paste(city, state, sep = ",")
# ) %>%
# layout(title = "2018 Trump Rallies", geo = list(scope="usa"))
# Customize the geo layout
g <- list(scope = 'usa',
showland = TRUE, landcolor = toRGB("gray90"),
showlakes = TRUE, lakecolor = toRGB("white"),
showsubunit = TRUE, subunitcolor = toRGB("white")
)
# Apply the geo layout to the map
# rallies2018 %>%
# plot_geo(locationmode = 'USA-states') %>%
# add_markers(x = ~long, y = ~lat, size = ~no.speakers,
# hoverinfo = "text", text = ~paste(city, state, sep = ",")
# ) %>%
# layout(title = "2018 Trump Rallies", geo = g)
fl_boundaries <- readr::read_csv("./RInputFiles/fl_boundaries.csv")
str(fl_boundaries)
fl_results <- readr::read_csv("./RInputFiles/fl_results.csv")
str(fl_results)
# Create a choropleth map displaying the Senate winners
# senate_vote %>%
# group_by(group) %>%
# plot_ly(x=~long, y=~lat, color=~PartyCode, split=~region) %>%
# add_polygons(line = list(width=0.4), showlegend=FALSE)
# Adjust the polygon colors and boundaries
# senate_map %>%
# group_by(group) %>%
# plot_ly(x = ~long, y = ~lat, color = ~party, split = ~region,
# colors=c("dodgerblue", "mediumseagreen", "tomato")
# ) %>%
# add_polygons(line = list(width = 0.4, color=toRGB("gray60")), showlegend = FALSE)
# Define the layout settings to polish the axes
# map_axes <- list(title = "", showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE)
# Apply the layout to both axes
# senate_map %>%
# group_by(group) %>%
# plot_ly(x = ~long, y = ~lat, color = ~party, split = ~region,
# colors = c("dodgerblue", "mediumseagreen", "tomato")
# ) %>%
# add_polygons(line = list(width = 0.4, color = toRGB("gray60")), showlegend = FALSE) %>%
# layout(xaxis=map_axes, yaxis=map_axes)
# Join the fl_boundaries and fl_results data frames
senate_vote <- left_join(fl_boundaries, fl_results, by = c("subregion" = "CountyName"))
# Specify the axis settings to polish the map
map_axes <- list(title = "", showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE)
# Create a polished county-level choropleth map of Pctvote
senate_vote %>%
group_by(group) %>%
plot_ly(x = ~long, y = ~lat, color = ~Pctvote, split = ~subregion) %>%
add_polygons(line = list(width = 0.4), showlegend = FALSE, colors = c("blue", "red")) %>%
layout(xaxis = map_axes, yaxis = map_axes)
Chapter 1 - Introduction to hyperparameters
Parameters vs. Hyperparameters:
Recap of machine learning basics:
Hyperparameter tuning in caret:
Example code includes:
# Load the breast cancer sample: per the parse spec below, one character
# column (diagnosis) plus ten numeric feature columns.
breast_cancer_data <- readr::read_csv("./RInputFiles/breast_cancer_data.csv")
## Parsed with column specification:
## cols(
## diagnosis = col_character(),
## concavity_mean = col_double(),
## symmetry_mean = col_double(),
## fractal_dimension_mean = col_double(),
## perimeter_se = col_double(),
## smoothness_se = col_double(),
## concavity_se = col_double(),
## `concave points_se` = col_double(),
## perimeter_worst = col_double(),
## symmetry_worst = col_double(),
## fractal_dimension_worst = col_double()
## )
# Inspect the structure: 100 observations of 11 variables (output below).
str(breast_cancer_data)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 100 obs. of 11 variables:
## $ diagnosis : chr "M" "M" "M" "M" ...
## $ concavity_mean : num 0.3001 0.0869 0.1974 0.2414 0.198 ...
## $ symmetry_mean : num 0.242 0.181 0.207 0.26 0.181 ...
## $ fractal_dimension_mean : num 0.0787 0.0567 0.06 0.0974 0.0588 ...
## $ perimeter_se : num 8.59 3.4 4.58 3.44 5.44 ...
## $ smoothness_se : num 0.0064 0.00522 0.00615 0.00911 0.01149 ...
## $ concavity_se : num 0.0537 0.0186 0.0383 0.0566 0.0569 ...
## $ concave points_se : num 0.0159 0.0134 0.0206 0.0187 0.0188 ...
## $ perimeter_worst : num 184.6 158.8 152.5 98.9 152.2 ...
## $ symmetry_worst : num 0.46 0.275 0.361 0.664 0.236 ...
## $ fractal_dimension_worst: num 0.1189 0.089 0.0876 0.173 0.0768 ...
## - attr(*, "spec")=
## .. cols(
## .. diagnosis = col_character(),
## .. concavity_mean = col_double(),
## .. symmetry_mean = col_double(),
## .. fractal_dimension_mean = col_double(),
## .. perimeter_se = col_double(),
## .. smoothness_se = col_double(),
## .. concavity_se = col_double(),
## .. `concave points_se` = col_double(),
## .. perimeter_worst = col_double(),
## .. symmetry_worst = col_double(),
## .. fractal_dimension_worst = col_double()
## .. )
# bc_train_data <- readr::read_csv("./RInputFiles/bc_train_data.csv")
# str(bc_train_data)
# Fit a linear model on the breast_cancer_data.
# Simple linear regression of concavity_mean on symmetry_mean; the two
# model parameters (intercept, slope) are learned from the data.
linear_model <- lm(concavity_mean ~ symmetry_mean, data=breast_cancer_data)
# Look at the summary of the linear_model.
summary(linear_model)
##
## Call:
## lm(formula = concavity_mean ~ symmetry_mean, data = breast_cancer_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.201877 -0.039201 -0.008432 0.030655 0.226150
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.15311 0.04086 -3.747 0.000303 ***
## symmetry_mean 1.33366 0.21257 6.274 9.57e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.06412 on 98 degrees of freedom
## Multiple R-squared: 0.2866, Adjusted R-squared: 0.2793
## F-statistic: 39.36 on 1 and 98 DF, p-value: 9.575e-09
# Extract the coefficients.
coef(linear_model)
## (Intercept) symmetry_mean
## -0.1531055 1.3336568
# Plot linear relationship.
# Scatter of the raw data in grey with the fitted line overlaid; geom_abline()
# reuses the slope and intercept extracted from the model above.
ggplot(data = breast_cancer_data, aes(x = symmetry_mean, y = concavity_mean)) +
geom_point(color = "grey") +
geom_abline(slope = coef(linear_model)[2], intercept = coef(linear_model)[1])
# Stratified 70/30 split on the diagnosis label; list = FALSE returns a
# plain row-index matrix usable for subsetting.
index <- caret::createDataPartition(breast_cancer_data$diagnosis, p = 0.7, list = FALSE)
# Subset the data into training and held-out test sets.
bc_train_data <- breast_cancer_data[index, ]
bc_test_data <- breast_cancer_data[-index, ]
# Resampling scheme: 5-fold cross-validation, repeated 3 times.
fitControl <- caret::trainControl(method = "repeatedcv", number = 5, repeats = 3)
# Fit a stochastic gradient boosting classifier using caret's default
# tuning grid for gbm.
gbm_model <- caret::train(
  diagnosis ~ .,
  data = bc_train_data,
  method = "gbm",
  trControl = fitControl,
  verbose = FALSE
)
##
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
##
## cluster
## The following object is masked from 'package:purrr':
##
## lift
# Look at the model
# Printing a caret train object shows the resampling setup and the
# accuracy/Kappa results for each candidate hyperparameter combination
# (recorded output follows below).
gbm_model
## Stochastic Gradient Boosting
##
## 70 samples
## 10 predictors
## 2 classes: 'B', 'M'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 3 times)
## Summary of sample sizes: 56, 56, 56, 56, 56, 56, ...
## Resampling results across tuning parameters:
##
## interaction.depth n.trees Accuracy Kappa
## 1 50 0.8809524 0.7619048
## 1 100 0.8857143 0.7714286
## 1 150 0.8761905 0.7523810
## 2 50 0.8809524 0.7619048
## 2 100 0.8714286 0.7428571
## 2 150 0.8761905 0.7523810
## 3 50 0.8809524 0.7619048
## 3 100 0.8809524 0.7619048
## 3 150 0.8857143 0.7714286
##
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 100, interaction.depth =
## 1, shrinkage = 0.1 and n.minobsinnode = 10.
set.seed(42) # Set seed so the resampling folds are reproducible.
tictoc::tic() # Start timer.
# Tune over a larger automatic grid: tuneLength = 4 candidate values per
# tunable hyperparameter, under 3x5-fold repeated cross-validation.
# trainControl is namespace-qualified (caret::) for consistency with the
# other calls in this document and so the code does not rely on caret
# having been attached via library().
gbm_model <- caret::train(diagnosis ~ ., data = bc_train_data, method = "gbm",
  trControl = caret::trainControl(method = "repeatedcv", number = 5, repeats = 3),
  verbose = FALSE, tuneLength = 4
)
tictoc::toc() # Stop timer.
## 2.61 sec elapsed
# Define hyperparameter grid.
# A manual "grid" with a single combination: 200 trees, stumps (depth 1),
# shrinkage 0.1, minimum node size 10.
hyperparams <- expand.grid(n.trees = 200, interaction.depth = 1,
  shrinkage = 0.1, n.minobsinnode = 10
)
# Apply hyperparameter grid to train().
# trainControl is namespace-qualified (caret::) for consistency with the
# other calls in this document and so the code does not rely on caret
# having been attached via library().
set.seed(42)
gbm_model <- caret::train(diagnosis ~ ., data = bc_train_data, method = "gbm",
  trControl = caret::trainControl(method = "repeatedcv", number = 5, repeats = 3),
  verbose = FALSE, tuneGrid = hyperparams
)
Chapter 2 - Hyperparameter Tuning with caret
Hyperparameter tuning in caret:
Grid vs. Random Search:
Adaptive Resampling:
Example code includes:
tgtData <- rep(c("Did not vote", "Voted"), each=40)
vecVoteData <- c(2, 2, 3, 2, 2, 3, 3, 1, 2, 3, 4, 4, 4, 3, 1, 2, 2, 2, 3, 2, 1, 2, 3, 2, 1, 3, 3, 3, 3, 4, 2, 4, 1, 4, 3, 3, 2, 4, 2, 1, 3, 2, 2, 1, 3, 3, 3, 4, 3, 4, 3, 4, 3, 3, 2, 3, 4, 3, 3, 2, 3, 3, 2, 3, 3, 3, 3, 3, 4, 2, 3, 3, 3, 3, 3, 2, 4, 1, 3, 4, 3, 3, 2, 2, 3, 3, 2, 2, 1, 2, 4, 2, 3, 2, 3, 4, 3, 2, 2, 2, 4, 1, 2, 2, 3, 2, 1, 3, 4, 2, 2, 2, 2, 4, 2, 2, 2, 4, 2, 3, 4, 1, 4, 4, 1, 3, 4, 4, 2, 2, 3, 3, 3, 2, 3, 1, 1, 2, 2, 3, 2, 2, 3, 2, 2, 2, 2, 2, 4, 2, 2, 2, 1, 4, 2, 2, 3, 3, 4, 2, 1, 1, 3, 1, 1, 1, 2, 1, 1, 2, 1, 2, 1, 2, 1, 1, 1, 2, 2, 3, 1, 3, 2, 1, 1, 3, 2, 2, 1, 2, 2, 1, 1, 1, 3, 2, 2, 1, 1, 2, 1, 3, 1, 1, 3, 2, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 4, 2, 1, 4, 2, 2, 3, 2, 3, 2, 3, 1, 3, 2, 1, 2, 2, 3, 2, 1, 1, 1, 3, 2, 1, 2, 2, 2, 2, 2, 2, 1, 3, 3, 1, 3, 3, 1, 3, 3, 2, 1, 1, 1, 2, 1, 2, 2, 1, 1, 3, 1, 1, 1, 2, 2, 2, 1, 2, 2, 4, 3, 3, 4, 1, 4, 4, 1, 2, 1, 3, 4, 4, 2, 3, 1, 3, 1, 3, 1, 1, 2, 3, 1, 2, 1, 3, 2, 1, 1, 1, 2, 2, 1, 1, 1, 1, 2, 1, 4, 1, 1, 3, 1, 2, 2, 2, 2, 3, 1, 1, 2, 3, 2, 2, 1, 3, 1, 1, 2, 2, 1, 2, 1, 2, 2, 2, 1, 2, 1, 2, 1, 2, 2, 2, 1, 4, 3, 3, 2, 1, 1, 2, 3, 3, 1, 2, 3, 2, 2, 3, 2, 3, 3, 1, 1, 2, 2, 2, 2, 2, 3, 1, 3, 2, 2, 3, 4, 3, 2, 3, 3, 2, 3, 2, 3, 2, 2, 1, 4, 1, 3, 3, 1, 1, 2, 1, 2, 2, 2, 1, 2, 2, 1, 3, 1, 3, 1, 1, 1, 2, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 1, 3, 2, 1, 1, 1, 1, 3, 2, 1, 1, 2, 2, 2, 1, 1, 2, 3, 1, 1, 1, 2, 1, 2, 1, 1, 1, 4, 1, 3, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 4, 2, 2, 1, 2, 2, 3, 1, 2, 1, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 1, 2, 1, 1, 2, 2, 3, 2, 2, 2, 2, 3, 2, 2, 2, 1, 1, 2, 3, 2, 2, 2, 3, 1, 2, 2, 2, 2, 1, 2, 2, 1, 1, 2, 2, 1, 2, 1, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 3, 2, 3, 2, 2, 2, 2, 3, 3, 4, 2, 3, 2, 1, 3, 2, 2, 2, 3, 2, 3, 2, 2, 2, 3, 3, 2, 3, 2, 3, 2, 2, 2, 3, 3, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 3, 3, 1, 2, 2, 2, 2, 3, 1, 2, 2, 3, 2, 2, 2, 3, 2, 3, 3, 2, 2, 3, 2, 2, 2, 2)
vecVoteData <- c(vecVoteData, 2, 3, 1, 2, 2, 2, 3, 2, 4, 1, 1, 2, 2, 2, 3, 4, 3, 4, 2, 2, 3, 2, 2, 4, 1, 1, 3, 4, 2, 4, 3, 3, 2, 4, 3, 3, 2, 2, 1, 3, 3, 1, 2, 2, 1, 1, 1, 1, 3, 2, 2, 1, 2, 2, 3, 3, 2, 1, 3, 2, 3, 3, 1, 3, 1, 2, 1, 2, 3, 3, 2, 3, 3, 2, 4, 3, 1, 2, 2, 3, 1, 1, 3, 3, 2, 2, 1, 2, 3, 1, 1, 2, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 1, 3, 1, 1, 3, 3, 2, 4, 3, 3, 3, 3, 3, 3, 4, 1, 1, 1, 1, 3, 3, 1, 2, 2, 3, 1, 3, 3, 4, 2, 3, 1, 3, 2, 1, 1, 3, 3, 3, 3, 3, 2, 3, 3, 1, 2, 3, 3, 2, 2, 3, 1, 3, 3, 3, 3, 3, 3, 3, 4, 4, 3, 2, 2, 3, 2, 4, 1, 1, 3, 3, 3, 3, 4, 3, 2, 3, 3, 1, 3, 1, 4, 1, 1, 4, 4, 4, 4, 3, 3, 4, 3, 2, 3, 4, 2, 4, 1, 1, 3, 4, 3, 1, 1, 3, 4, 4, 4, 1, 1, 3, 1, 3, 3, 1, 3, 1, 3, 3, 3, 4, 4, 3, 3, 4, 2, 2, 3, 2, 3, 4, 2, 3, 4, 3, 3, 2, 2, 1, 2, 2, 8, 2, 8, 8, 2, 2, 2, 2, 2, 1, 2, 2, 8, 8, 8, 2, 1, 2, 2, 2, 1, 2, 1, 8, 8, 2, 2, 8, 8, 1, 2, 2, 2, 8, 1, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 2, 1, 2, 8, 2, 1, 1, 1, 1, 2, 1, 1, 2, 8, 2, 1, 1, 2, 1, 2, 1, 3, 8, 3, 3, 1, 3, 2, 2, 3, 2, 1, 3, 3, 3, 3, 3, 3, 8, 3, 3, 3, 3, 3, 2, 3, 3, 2, 2, 3, 3, 3, 2, 3, 8, 2, 3, 8, 3, 2, 3, 2, 3, 3, 3, 3, 2, 3, 1, 3, 1, 8, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 3, 3, 3, 2, 2, 3, 3, 1, 3, 3, 3, 3, 3, 3, 2, 3, 3, 2, 3, 1, 8, 8, 3, 1, 3, 3, 3, 3, 8, 3, 3, 3, 3, 8, 3, 3, 8, 3, 1, 3, 3, 3, 2, 3, 8, 3, 3, 3, 3, 8, 2, 2, 3, 1, 1, 2, 2, 2, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 2, 3, 3, 3, 2, 3, 2, 3, 2, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 8, 1, 1, 1, 1, 2, 1, 2, 1, 2, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 8, 2, 1, 1, 1, 2, 2, 2, 2, 8, 2, 2, 2, 2, 2, 2, 1, 1, 2, 1, 2, 1, 1, 1, 2, 1, 1, 8, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 8, 2)
vecVoteData <- c(vecVoteData, 2, 2, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2, 8, 2, 1, 2, 2, 1, 2, 2, 2, 8, 1, 2, 1, 1, 1, 2, 2, 2, 1, 2, 2, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 2, 2, 1, 1, 2, 1, 1, 2, 2, 1, 1, 2, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 2, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 8, 8, 2, 1, 1, 8, 1, 1, 1, 1, 2, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 2, 8, 2, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 8, 2, 1, 1, 2, 2, 1, 1, 8, 2, 1, 1, 8, 1, 1, 1, 2, 8, 1, 1, 8, 2, 2, 1, 1, 2, 1, 2, 1, 2, 1, 2, 2, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 3, 1, 2, 2, 2, 1, 1, 1, 1, 2, 1, 3, 1, 1, 2, 2, 1, 2, 2, 3, 1, 2, 2, 1, 1, 2, 1, 1, 1, 1, 1, 2, 3, 2, 1, 2, 1, 4, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 3, 1, 2, 2, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 3, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 2, 1, 1, 1, 1, 2, 1, 2, 2, 4, 2, 1, 1, 2, 1, 1, 3, 2, 4, 2, 2, 1, 1, 1, 2, 1, 1, 2, 4, 1, 2, 2, 3, 1, 2, 2, 4, 3, 2, 1, 1, 1, 3, 1, 4, 1, 1, 2, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2, 1, 1, 4, 1, 3, 1, 1, 2, 2, 2, 3, 2, 2, 1, 1, 1, 1, 2, 3, 3, 1, 1, 1, 1, 1, 2, 2, 1, 1, 3, 2, 1, 1, 1, 1, 2, 4, 2, 2, 2, 1, 2, 1, 3, 1, 2, 2, 3, 1, 2, 3, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 4, 1, 2, 3, 2, 1, 3, 1, 2, 2, 1, 3, 2, 2, 1, 2, 3, 4, 1, 1, 2, 3, 1, 1, 1, 2, 3, 4, 2, 1, 4, 2, 2, 2, 2, 1, 2, 3, 1, 1, 2, 1, 1, 4, 1, 1, 4, 1, 4, 3, 3, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 2, 3, 4, 1, 2, 1, 1, 3, 1, 1, 3, 1, 1, 2, 1, 2, 2, 1, 2, 2, 3, 1, 2, 2, 1, 1, 2, 1, 2, 2, 1, 1, 1, 4, 1, 1, 2, 1, 2, 1, 1, 1, 4, 1, 1, 2, 2, 2, 3, 3, 2, 2, 1, 2, 1, 2, 2, 1, 1, 3, 1, 1, 1, 1, 1, 2, 1, 3, 2, 2, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 3, 1, 1, 3, 2, 1, 1, 2, 1, 2, 1, 2, 1, 1, 2, 2, 1, 1, 2, 3, 2, 2, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 3, 1, 1, 4, 1, 2, 1, 1, 1, 2)
vecVoteData <- c(vecVoteData, 1, 1, 1, 2, 3, 2, 1, 1, 2, 3, 4, 1, 1, 2, 2, 2, 4, 2, 2, 4, 3, 3, 2, 4, 4, 3, 3, 4, 2, 2, 4, 4, 4, 2, 3, 1, 4, 2, 4, 4, 3, 3, 1, 4, 3, 3, 3, 1, 3, 2, 1, 2, 1, 1, 3, 2, 4, 2, 3, 2, 2, 1, 4, 3, 3, 3, 1, 2, 3, 3, 1, 3, 4, 4, 4, 3, 3, 4, 1, 1, 2, 4, 1, 1, 1, 2, 1, 1, 2, 2, 2, 1, 1, 1, 2, 1, 2, 1, 1, 1, 2, 2, 1, 3, 1, 2, 2, 1, 3, 2, 1, 1, 1, 1, 3, 1, 1, 1, 3, 2, 2, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 2, 2, 2, 3, 2, 1, 2, 2, 1, 2, 1, 2, 1, 3, 1, 2, 2, 4, 1, 1, 1, 2, 1, 1, 1, 1, 3, 2, 3, 2, 2, 1, 2, 1, 3, 2, 1, 3, 1, 1, 4, 2, 2, 1, 3, 2, 2, 3, 2, 1, 2, 2, 2, 3, 2, 2, 1, 1, 4, 1, 1, 2, 4, 2, 3, 2, 2, 2, 4, 4, 2, 1, 1, 2, 1, 2, 2, 2, 3, 3, 3, 4, 2, 4, 3, 1, 4, 3, 3, 2, 2, 2, 2, 3, 1, 2, 2, 4, 1, 1, 3, 4, 1, 1, 1, 3, 2, 4, 1, 2, 1, 1, 2, 1, 1, 2, 1, 2, 1, 1, 2, 1, 1, 4, 1, 2, 1, 4, 1, 3, 1, 3, 1, 1, 1, 3, 2, 2, 2, 1, 1, 2, 1, 1, 2, 3, 1, 2, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2, 4, 2, 1, 2, 2, 4, 2, 1, 3, 2, 1, 2, 2, 2, 2, 2, 1, 1, 2, 2, 1, 1, 3, 4, 2, 1, 1, 3, 2, 2, 1, 1, 1, 1, 2, 3, 3, 1, 1, 1, 1, 1, 4, 2, 1, 1, 4, 3, 1, 3, 1, 1, 2, 4, 3, 1, 4, 1, 2, 1, 3, 1, 2, 2, 4, 1, 2, 3, 1, 4, 1, 2, 4, 1, 1, 1, 1, 1, 1, 2, 3, 1, 4, 1, 4, 4, 2, 1, 4, 1, 4, 2, 2, 4, 2, 3, 1, 4, 3, 4, 3, 1, 3, 4, 1, 1, 1, 3, 1, 1, 2, 1, 1, 3, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 4, 1, 1, 2, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 4, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 4, 1, 1, 2, 2, 1, 1, 1, 1, 2, 2, 1, 4, 1, 1, 2, 2, 2, 2, 1, 2, 2, 1, 2, 2, 1, 1, 2, 2, 1, 3, 1, 1, 2, 2, 2, 2, 1, 1, 2, 3, 3, 1, 2, 2, 4, 1, 2, 3, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 3, 2, 1, 2, 2, 2, 2, 1, 2, 2, 1, 3, 2, 2, 1, 2, 1, 2, 2, 3, 2, 1, 2, 2, 1, 1, 1, 1, 2, 1, 2, 2, 1, 3, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 3, 2, 2, 2, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 3, 1, 3, 1, 2, 1, 1, 1, 1, 2, 1, 2, 1, 2, 1, 1, 1, 2, 1, 2, 1, 2, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1)
vecVoteData <- c(vecVoteData, 1, 1, 2, 1, 4, 2, 2, 2, 2, 1, 2, 1, 2, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 2, 2, 1, 1, 2, 1, 1, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 2, 1, 2, 3, 1, 3, 1, 4, 1, 3, 3, 3, 2, 2, 2, 3, 1, 1, 2, 1, 1, 2, 1, 1, 2, 1, 1, 1, 3, 1, 1, 1, 1, 4, 1, 2, 2, 4, 3, 1, 3, 2, 2, 1, 2, 2, 2, 1, 1, 2, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 2, 2, 2, 1, 1, 2, 1, 2, 1, 4, 2, 2, 2, 3, 1, 2, 2, 4, 1, 2, 2, 2, 1, 1, 1, 2, 1, 1, 1, 2, 2, 1, 1, 1, 3, 1, 2, 1, 2, 1, 2, 1, 1, 1, 2, 1, 1, 2, 3, 1, 1, 1, 1, 1, 2, 1, 3, 1, 1, 1, 1, 2, 1, 3, 1, 2, 2, 2, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 3, 1, 2, 2, 1, 2, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 4, 1, 3, 2, 1, 1, 1, 1, 1, 2, 2, 1, 2, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, 1, 2, 4, 4, 3, 1, 4, 2, 2, 2, 3, 1, 1, 2, 1, 1, 4, 4, 1, 4, 3, 2, 1, 2, 4, 3, 3, 4, 1, 2, 1, 2, 2, 3, 3, 1, 4, 3, 3, 4, 3, 4, 1, 1, 3, 3, 1, 1, 3, 1, 1, 3, 1, 3, 3, 3, 3, 4, 4, 2, 2, 1, 1, 2, 1, 2, 2, 2, 1, 2, 1, 2, 1, 4, 1, 1, 1, 2, 1, 4, 3, 4, 4, 2, 2, 2, 3, 1, 2, 1, 2, 2, 3, 3, 2, 1, 1, 2, 1, 1, 1, 1, 3, 1, 1, 3, 2, 2, 1, 1, 1, 2, 1, 3, 2, 1, 1, 2, 1, 1, 3, 3, 2, 2, 2, 2, 1, 3, 3, 4, 3, 3, 3, 3, 1, 3, 2, 3, 1, 1, 2, 2, 1, 3, 1, 2, 2, 1, 1, 1, 3, 1, 1, 1, 3, 4, 1, 1, 2, 3, 3, 2, 2, 2, 2, 1, 2, 1, 4, 3, 3, 1, 1, 2, 1, 1, 4, 1, 1, 4, 2, 3, 2, 3, 3, 3, 2, 2, 2, 3, 1, 1, 1, 1, 3, 1, 2, 2, 3, 4, 3, 1, 1, 3, 1, 2, 1, 1, 1, 1, 1, 1, 2, 2, 3, 2, 3, 3, 3, 4, 2, 1, 3, 2, 2, 1, 2, 2, 1, 3, 1, 2, 4, 4, 2, 1, 1, 3, 1, 2, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 3, 1, 1, 2, 2, 1, 3, 1, 1, 4, 3, 3, 2, 4, 3, 3, 2, 2, 1, 2, 3, 2, 2, 2, 3, 1, 2, 2, 3, 2, 3, 2, 1, 4, 2, 3, 1, 1, 1, 2, 1, 1, 2, 2, 4, 1, 3, 4, 3, 4, 2, 1, 4, 2, 3, 2, 1, 2, 2, 4, 1, 2, 4, 4, 3, 3, 3, 4, 1, 2, 1, 2)
# Assemble the 80-row training tibble: the turnout label column plus the
# survey responses reshaped column-wise (byrow = FALSE) from the long
# vecVoteData vector built above.
voters_train_data <- tibble(turnout16_2016=tgtData) %>%
bind_cols(as.data.frame(matrix(vecVoteData, nrow=80, byrow=FALSE)))
# Replace the auto-generated matrix column names (V1, V2, ...) with the
# survey item names; first entry keeps the turnout label's name.
names(voters_train_data) <- c('turnout16_2016', 'RIGGED_SYSTEM_1_2016', 'RIGGED_SYSTEM_2_2016', 'RIGGED_SYSTEM_3_2016', 'RIGGED_SYSTEM_4_2016', 'RIGGED_SYSTEM_5_2016', 'RIGGED_SYSTEM_6_2016', 'track_2016', 'persfinretro_2016', 'econtrend_2016', 'Americatrend_2016', 'futuretrend_2016', 'wealth_2016', 'values_culture_2016', 'US_respect_2016', 'trustgovt_2016', 'trust_people_2016', 'helpful_people_2016', 'fair_people_2016', 'imiss_a_2016', 'imiss_b_2016', 'imiss_c_2016', 'imiss_d_2016', 'imiss_e_2016', 'imiss_f_2016', 'imiss_g_2016', 'imiss_h_2016', 'imiss_i_2016', 'imiss_k_2016', 'imiss_l_2016', 'imiss_m_2016', 'imiss_n_2016', 'imiss_o_2016', 'imiss_p_2016', 'imiss_r_2016', 'imiss_s_2016', 'imiss_t_2016', 'imiss_u_2016', 'imiss_x_2016', 'imiss_y_2016')
# Quick look at the resulting 80 x 40 tibble (output below).
glimpse(voters_train_data)
## Observations: 80
## Variables: 40
## $ turnout16_2016 <chr> "Did not vote", "Did not vote", "Did not ...
## $ RIGGED_SYSTEM_1_2016 <dbl> 2, 2, 3, 2, 2, 3, 3, 1, 2, 3, 4, 4, 4, 3,...
## $ RIGGED_SYSTEM_2_2016 <dbl> 3, 3, 2, 2, 3, 3, 2, 2, 1, 2, 4, 2, 3, 2,...
## $ RIGGED_SYSTEM_3_2016 <dbl> 1, 1, 3, 1, 1, 1, 2, 1, 1, 2, 1, 2, 1, 2,...
## $ RIGGED_SYSTEM_4_2016 <dbl> 2, 1, 2, 2, 2, 2, 2, 2, 1, 3, 3, 1, 3, 3,...
## $ RIGGED_SYSTEM_5_2016 <dbl> 1, 2, 2, 2, 2, 3, 1, 1, 2, 3, 2, 2, 1, 3,...
## $ RIGGED_SYSTEM_6_2016 <dbl> 1, 1, 2, 1, 2, 2, 2, 1, 2, 2, 1, 3, 1, 3,...
## $ track_2016 <dbl> 2, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 1, 2, 1,...
## $ persfinretro_2016 <dbl> 2, 2, 2, 2, 1, 2, 2, 2, 3, 2, 3, 2, 2, 2,...
## $ econtrend_2016 <dbl> 2, 2, 2, 3, 1, 2, 2, 2, 3, 2, 4, 1, 1, 2,...
## $ Americatrend_2016 <dbl> 2, 3, 1, 1, 3, 3, 2, 2, 1, 2, 3, 1, 1, 2,...
## $ futuretrend_2016 <dbl> 3, 3, 3, 4, 4, 3, 2, 2, 3, 2, 4, 1, 1, 3,...
## $ wealth_2016 <dbl> 2, 2, 1, 2, 2, 8, 2, 8, 8, 2, 2, 2, 2, 2,...
## $ values_culture_2016 <dbl> 3, 8, 3, 3, 1, 3, 2, 2, 3, 2, 1, 3, 3, 3,...
## $ US_respect_2016 <dbl> 3, 3, 3, 2, 3, 3, 2, 3, 1, 8, 8, 3, 1, 3,...
## $ trustgovt_2016 <dbl> 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3,...
## $ trust_people_2016 <dbl> 1, 1, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 1, 1,...
## $ helpful_people_2016 <dbl> 2, 2, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 2,...
## $ fair_people_2016 <dbl> 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2, 1,...
## $ imiss_a_2016 <dbl> 2, 2, 1, 1, 2, 1, 2, 1, 2, 1, 2, 2, 1, 1,...
## $ imiss_b_2016 <dbl> 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 1, 1, 1,...
## $ imiss_c_2016 <dbl> 3, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 2, 1, 1,...
## $ imiss_d_2016 <dbl> 2, 2, 1, 1, 1, 1, 2, 3, 3, 1, 1, 1, 1, 1,...
## $ imiss_e_2016 <dbl> 3, 4, 2, 1, 4, 2, 2, 2, 2, 1, 2, 3, 1, 1,...
## $ imiss_f_2016 <dbl> 2, 2, 2, 3, 3, 2, 2, 1, 2, 1, 2, 2, 1, 1,...
## $ imiss_g_2016 <dbl> 1, 2, 3, 2, 1, 1, 2, 3, 4, 1, 1, 2, 2, 2,...
## $ imiss_h_2016 <dbl> 1, 1, 1, 2, 1, 1, 2, 2, 2, 1, 1, 1, 2, 1,...
## $ imiss_i_2016 <dbl> 3, 2, 3, 2, 2, 1, 2, 1, 3, 2, 1, 3, 1, 1,...
## $ imiss_k_2016 <dbl> 2, 4, 1, 2, 1, 1, 2, 1, 1, 2, 1, 2, 1, 1,...
## $ imiss_l_2016 <dbl> 2, 2, 1, 1, 1, 1, 2, 3, 3, 1, 1, 1, 1, 1,...
## $ imiss_m_2016 <dbl> 1, 1, 2, 1, 1, 3, 2, 1, 1, 1, 1, 1, 1, 1,...
## $ imiss_n_2016 <dbl> 2, 2, 1, 4, 1, 1, 2, 2, 2, 2, 1, 2, 2, 1,...
## $ imiss_o_2016 <dbl> 2, 1, 2, 2, 1, 3, 2, 1, 1, 1, 2, 1, 1, 1,...
## $ imiss_p_2016 <dbl> 4, 2, 2, 2, 2, 1, 2, 1, 2, 1, 3, 1, 1, 1,...
## $ imiss_r_2016 <dbl> 3, 2, 2, 1, 2, 2, 2, 1, 1, 2, 3, 1, 1, 1,...
## $ imiss_s_2016 <dbl> 1, 1, 2, 1, 1, 2, 3, 1, 1, 1, 1, 1, 2, 1,...
## $ imiss_t_2016 <dbl> 4, 4, 3, 1, 4, 2, 2, 2, 3, 1, 1, 2, 1, 1,...
## $ imiss_u_2016 <dbl> 4, 2, 2, 2, 3, 1, 2, 1, 2, 2, 3, 3, 2, 1,...
## $ imiss_x_2016 <dbl> 2, 2, 2, 1, 2, 1, 4, 3, 3, 1, 1, 2, 1, 1,...
## $ imiss_y_2016 <dbl> 2, 2, 2, 2, 2, 1, 2, 2, 3, 1, 1, 2, 2, 1,...
# Cartesian grid for the polynomial-kernel SVM: 3 degrees x 3 scales with
# the cost parameter C held fixed at 0.5 (9 combinations in total).
man_grid <- expand.grid(degree = c(1, 2, 3), scale = c(0.1, 0.01, 0.001), C = 0.5)
# 3-fold cross-validation, repeated 5 times.
fitControl <- caret::trainControl(method = "repeatedcv", number = 3, repeats = 5)
# Time the full grid search; the seed fixes the resampling folds.
tictoc::tic()
set.seed(42)
svm_model_voters_grid <- caret::train(
  turnout16_2016 ~ .,
  data = voters_train_data,
  method = "svmPoly",
  trControl = fitControl,
  tuneGrid = man_grid,
  verbose = FALSE
)
tictoc::toc()
## 5.94 sec elapsed
# Plot default
# Default plot: performance (Accuracy) across the tuned hyperparameter grid.
plot(svm_model_voters_grid)
# Plot Kappa level-plot
plot(svm_model_voters_grid, metric = "Kappa", plotType = "level")
# Define the grid with hyperparameter ranges
big_grid <- expand.grid(size = seq(from = 1, to = 5, by = 1), decay = c(0, 1))
# Train control with grid search
fitControl <- caret::trainControl(method = "repeatedcv", number = 3, repeats = 5, search = "grid")
# Train neural net
# NOTE(review): big_grid is defined above but never passed to train() — there
# is no tuneGrid = big_grid argument — so caret falls back to its default
# tuning grid. The recorded nnet output below (weight counts 42/124/206,
# i.e. hidden sizes 1/3/5) matches the default grid rather than big_grid's
# sizes 1..5, which suggests the omission is in the original exercise code.
tictoc::tic()
set.seed(42)
nn_model_voters_big_grid <- caret::train(turnout16_2016 ~ ., data = voters_train_data,
method = "nnet", trControl = fitControl, verbose = FALSE
)
## # weights: 42
## initial value 37.543902
## iter 10 value 31.027084
## iter 20 value 11.826996
## iter 30 value 11.383068
## iter 40 value 11.369768
## iter 50 value 11.368247
## iter 60 value 11.368065
## final value 11.368020
## converged
## # weights: 124
## initial value 38.770476
## iter 10 value 33.047592
## iter 20 value 33.024777
## iter 30 value 32.288188
## iter 40 value 31.181895
## iter 50 value 31.163093
## iter 60 value 30.251351
## iter 70 value 29.240243
## iter 80 value 29.237457
## iter 90 value 29.237339
## final value 29.237329
## converged
## # weights: 206
## initial value 37.128261
## iter 10 value 24.521037
## iter 20 value 9.651809
## iter 30 value 8.073319
## iter 40 value 8.054354
## iter 50 value 8.052508
## iter 60 value 8.052367
## iter 70 value 8.052246
## final value 8.052241
## converged
## # weights: 42
## initial value 37.768117
## iter 10 value 29.235462
## iter 20 value 15.314412
## iter 30 value 11.683048
## iter 40 value 11.631211
## final value 11.631175
## converged
## # weights: 124
## initial value 37.897167
## iter 10 value 22.263986
## iter 20 value 14.958578
## iter 30 value 12.341855
## iter 40 value 11.615990
## iter 50 value 10.287985
## iter 60 value 9.817803
## iter 70 value 9.445922
## iter 80 value 9.250218
## iter 90 value 9.151521
## iter 100 value 9.149065
## final value 9.149065
## stopped after 100 iterations
## # weights: 206
## initial value 44.230229
## iter 10 value 22.896820
## iter 20 value 13.420811
## iter 30 value 10.602282
## iter 40 value 9.648477
## iter 50 value 9.206581
## iter 60 value 9.107379
## iter 70 value 8.930251
## iter 80 value 8.762096
## iter 90 value 8.745108
## iter 100 value 8.733854
## final value 8.733854
## stopped after 100 iterations
## # weights: 42
## initial value 38.599223
## iter 10 value 36.013733
## iter 20 value 28.650112
## iter 30 value 25.626085
## iter 40 value 25.102655
## iter 50 value 25.099438
## iter 60 value 25.096932
## iter 70 value 25.093795
## iter 80 value 20.499773
## iter 90 value 11.863452
## iter 100 value 8.588471
## final value 8.588471
## stopped after 100 iterations
## # weights: 124
## initial value 37.369941
## iter 10 value 15.908077
## iter 20 value 11.939921
## iter 30 value 11.443831
## iter 40 value 10.998310
## iter 50 value 8.618822
## iter 60 value 8.585574
## iter 70 value 8.580485
## iter 80 value 8.573734
## iter 90 value 8.568160
## iter 100 value 8.563467
## final value 8.563467
## stopped after 100 iterations
## # weights: 206
## initial value 40.450951
## iter 10 value 30.381579
## iter 20 value 14.285982
## iter 30 value 14.254171
## iter 40 value 14.246870
## iter 50 value 11.470487
## iter 60 value 11.389585
## iter 70 value 11.077604
## iter 80 value 6.484972
## iter 90 value 6.052391
## iter 100 value 4.980032
## final value 4.980032
## stopped after 100 iterations
## # weights: 42
## initial value 40.597417
## iter 10 value 19.009959
## iter 20 value 14.390042
## iter 30 value 13.844423
## iter 40 value 13.843837
## final value 13.843836
## converged
## # weights: 124
## initial value 39.052462
## iter 10 value 19.231930
## iter 20 value 12.458882
## iter 30 value 11.315982
## iter 40 value 11.220401
## iter 50 value 11.208265
## iter 60 value 11.207881
## final value 11.207879
## converged
## # weights: 206
## initial value 38.717452
## iter 10 value 17.154775
## iter 20 value 7.114946
## iter 30 value 3.931071
## iter 40 value 3.561011
## iter 50 value 3.530411
## iter 60 value 3.508046
## iter 70 value 3.035399
## iter 80 value 2.427074
## iter 90 value 1.397239
## iter 100 value 1.390372
## final value 1.390372
## stopped after 100 iterations
## # weights: 42
## initial value 40.195727
## iter 10 value 25.333235
## iter 20 value 15.077790
## iter 30 value 14.172502
## iter 40 value 12.110285
## iter 50 value 11.963469
## final value 11.962828
## converged
## # weights: 124
## initial value 38.884663
## iter 10 value 18.872263
## iter 20 value 11.484779
## iter 30 value 10.516289
## iter 40 value 10.038859
## iter 50 value 9.832094
## iter 60 value 9.745810
## iter 70 value 9.745120
## iter 70 value 9.745120
## iter 70 value 9.745120
## final value 9.745120
## converged
## # weights: 206
## initial value 50.461538
## iter 10 value 25.068375
## iter 20 value 16.051882
## iter 30 value 11.956054
## iter 40 value 11.512066
## iter 50 value 10.468311
## iter 60 value 9.842082
## iter 70 value 9.629081
## iter 80 value 9.497557
## iter 90 value 9.260460
## iter 100 value 9.202350
## final value 9.202350
## stopped after 100 iterations
## # weights: 42
## initial value 37.451965
## iter 10 value 29.589302
## iter 20 value 28.439442
## iter 30 value 28.427382
## iter 40 value 28.253599
## iter 50 value 24.347969
## iter 60 value 20.469627
## iter 70 value 18.572164
## iter 80 value 17.333765
## iter 90 value 14.714238
## iter 100 value 13.738015
## final value 13.738015
## stopped after 100 iterations
## # weights: 124
## initial value 42.709113
## iter 10 value 11.639490
## iter 20 value 2.520854
## iter 30 value 0.267160
## iter 40 value 0.243169
## iter 50 value 0.206714
## iter 60 value 0.168213
## iter 70 value 0.133088
## iter 80 value 0.122380
## iter 90 value 0.112877
## iter 100 value 0.108669
## final value 0.108669
## stopped after 100 iterations
## # weights: 206
## initial value 37.506941
## iter 10 value 18.604849
## iter 20 value 13.069905
## iter 30 value 5.643703
## iter 40 value 0.263280
## iter 50 value 0.173518
## iter 60 value 0.149607
## iter 70 value 0.139005
## iter 80 value 0.128954
## iter 90 value 0.118010
## iter 100 value 0.109979
## final value 0.109979
## stopped after 100 iterations
## # weights: 42
## initial value 36.889615
## iter 10 value 36.727384
## final value 36.727366
## converged
## # weights: 124
## initial value 51.278943
## iter 10 value 34.993438
## iter 20 value 15.486490
## iter 30 value 8.743606
## iter 40 value 6.289637
## iter 50 value 3.910378
## iter 60 value 3.821632
## iter 70 value 3.819593
## iter 80 value 3.819228
## iter 90 value 3.819126
## iter 100 value 3.819115
## final value 3.819115
## stopped after 100 iterations
## # weights: 206
## initial value 39.581972
## iter 10 value 14.017652
## iter 20 value 9.446853
## iter 30 value 9.419187
## iter 40 value 9.418467
## final value 9.418467
## converged
## # weights: 42
## initial value 37.977920
## iter 10 value 34.764588
## iter 20 value 19.209881
## iter 30 value 11.393677
## iter 40 value 10.575455
## iter 50 value 10.573403
## final value 10.573349
## converged
## # weights: 124
## initial value 39.520958
## iter 10 value 34.772321
## iter 20 value 21.774539
## iter 30 value 11.896766
## iter 40 value 10.204592
## iter 50 value 9.469549
## iter 60 value 9.260529
## iter 70 value 9.150984
## iter 80 value 9.097132
## iter 90 value 9.050215
## iter 100 value 8.478206
## final value 8.478206
## stopped after 100 iterations
## # weights: 206
## initial value 45.439219
## iter 10 value 29.575029
## iter 20 value 13.078133
## iter 30 value 9.312289
## iter 40 value 8.551431
## iter 50 value 8.091341
## iter 60 value 7.943568
## iter 70 value 7.697956
## iter 80 value 7.591779
## iter 90 value 7.495377
## iter 100 value 7.485379
## final value 7.485379
## stopped after 100 iterations
## # weights: 42
## initial value 38.828766
## iter 10 value 12.175648
## iter 20 value 11.976102
## iter 30 value 11.973690
## iter 40 value 11.964421
## iter 50 value 9.888034
## iter 60 value 9.799411
## iter 70 value 9.795651
## iter 80 value 9.794210
## iter 90 value 9.786879
## iter 100 value 9.784080
## final value 9.784080
## stopped after 100 iterations
## # weights: 124
## initial value 39.205931
## iter 10 value 19.548574
## iter 20 value 17.282724
## iter 30 value 13.141551
## iter 40 value 12.064310
## iter 50 value 9.727721
## iter 60 value 8.105476
## iter 70 value 7.688740
## iter 80 value 7.676364
## iter 90 value 7.134801
## iter 100 value 5.638429
## final value 5.638429
## stopped after 100 iterations
## # weights: 206
## initial value 37.300577
## iter 10 value 16.083120
## iter 20 value 1.085241
## iter 30 value 0.201556
## iter 40 value 0.150128
## iter 50 value 0.135514
## iter 60 value 0.129346
## iter 70 value 0.121834
## iter 80 value 0.113025
## iter 90 value 0.110736
## iter 100 value 0.101419
## final value 0.101419
## stopped after 100 iterations
## # weights: 42
## initial value 38.937284
## final value 37.429948
## converged
## # weights: 124
## initial value 41.552094
## iter 10 value 14.662688
## iter 20 value 5.470405
## iter 30 value 4.335065
## iter 40 value 0.407672
## iter 50 value 0.036928
## iter 60 value 0.000289
## final value 0.000075
## converged
## # weights: 206
## initial value 41.051035
## iter 10 value 23.865159
## iter 20 value 14.354181
## iter 30 value 14.264236
## iter 40 value 13.462573
## iter 50 value 9.627750
## iter 60 value 8.761932
## iter 70 value 8.741738
## iter 80 value 8.432078
## iter 90 value 8.429836
## iter 100 value 8.244398
## final value 8.244398
## stopped after 100 iterations
## # weights: 42
## initial value 39.387555
## iter 10 value 17.697016
## iter 20 value 11.707385
## iter 30 value 11.217781
## iter 40 value 11.216432
## iter 40 value 11.216432
## iter 40 value 11.216432
## final value 11.216432
## converged
## # weights: 124
## initial value 40.926399
## iter 10 value 36.653859
## iter 20 value 13.671915
## iter 30 value 9.805653
## iter 40 value 8.968733
## iter 50 value 8.736463
## iter 60 value 8.643900
## iter 70 value 8.642037
## final value 8.642037
## converged
## # weights: 206
## initial value 43.533859
## iter 10 value 32.551958
## iter 20 value 17.736477
## iter 30 value 11.407724
## iter 40 value 9.654588
## iter 50 value 8.905511
## iter 60 value 8.639356
## iter 70 value 8.587082
## iter 80 value 8.516321
## iter 90 value 8.423123
## iter 100 value 8.293314
## final value 8.293314
## stopped after 100 iterations
## # weights: 42
## initial value 37.499197
## iter 10 value 23.237414
## iter 20 value 22.899398
## iter 30 value 22.897209
## iter 40 value 22.892996
## iter 50 value 22.889618
## iter 60 value 22.888157
## iter 70 value 22.887123
## iter 80 value 21.784339
## iter 90 value 14.268243
## iter 100 value 9.811080
## final value 9.811080
## stopped after 100 iterations
## # weights: 124
## initial value 37.177016
## iter 10 value 26.669062
## iter 20 value 14.047657
## iter 30 value 11.844210
## iter 40 value 10.492051
## iter 50 value 6.974132
## iter 60 value 6.875568
## iter 70 value 6.866758
## iter 80 value 6.864666
## iter 90 value 6.862890
## iter 100 value 6.859980
## final value 6.859980
## stopped after 100 iterations
## # weights: 206
## initial value 47.779450
## iter 10 value 25.407230
## iter 20 value 16.264920
## iter 30 value 14.094346
## iter 40 value 5.777783
## iter 50 value 4.420906
## iter 60 value 4.413625
## iter 70 value 4.405804
## iter 80 value 4.393388
## iter 90 value 4.384209
## iter 100 value 4.377950
## final value 4.377950
## stopped after 100 iterations
## # weights: 42
## initial value 36.121674
## iter 10 value 18.765032
## iter 20 value 13.718974
## iter 30 value 13.696090
## iter 40 value 13.695914
## final value 13.695911
## converged
## # weights: 124
## initial value 36.875709
## iter 10 value 20.945106
## iter 20 value 10.637618
## iter 30 value 7.942709
## iter 40 value 7.567564
## iter 50 value 7.564512
## iter 60 value 7.563890
## iter 70 value 7.563807
## final value 7.563792
## converged
## # weights: 206
## initial value 38.105752
## iter 10 value 17.942875
## iter 20 value 6.806510
## iter 30 value 6.497444
## iter 40 value 6.171895
## iter 50 value 5.976522
## iter 60 value 1.575089
## iter 70 value 1.418107
## iter 80 value 1.387946
## iter 90 value 1.386482
## iter 100 value 1.386386
## final value 1.386386
## stopped after 100 iterations
## # weights: 42
## initial value 38.438385
## iter 10 value 32.251851
## iter 20 value 24.110705
## iter 30 value 18.306974
## iter 40 value 12.034778
## iter 50 value 11.124601
## iter 60 value 11.117284
## final value 11.117281
## converged
## # weights: 124
## initial value 38.235688
## iter 10 value 22.223623
## iter 20 value 14.849084
## iter 30 value 12.806201
## iter 40 value 10.501168
## iter 50 value 9.341200
## iter 60 value 9.024103
## iter 70 value 8.795149
## iter 80 value 8.762934
## iter 90 value 8.683522
## iter 100 value 8.673141
## final value 8.673141
## stopped after 100 iterations
## # weights: 206
## initial value 46.845663
## iter 10 value 22.639420
## iter 20 value 11.550138
## iter 30 value 9.365575
## iter 40 value 8.662270
## iter 50 value 8.524984
## iter 60 value 8.311540
## iter 70 value 8.230576
## iter 80 value 8.203278
## iter 90 value 8.177681
## iter 100 value 8.174481
## final value 8.174481
## stopped after 100 iterations
## # weights: 42
## initial value 36.424515
## final value 36.044322
## converged
## # weights: 124
## initial value 37.485456
## iter 10 value 21.269622
## iter 20 value 14.136111
## iter 30 value 11.904381
## iter 40 value 11.393983
## iter 50 value 11.387154
## iter 60 value 11.383586
## iter 70 value 6.144184
## iter 80 value 4.321966
## iter 90 value 4.240870
## iter 100 value 4.234243
## final value 4.234243
## stopped after 100 iterations
## # weights: 206
## initial value 36.805335
## iter 10 value 15.575051
## iter 20 value 11.381900
## iter 30 value 11.368671
## iter 40 value 11.365465
## iter 50 value 11.363026
## iter 60 value 11.360019
## iter 70 value 11.357496
## iter 80 value 8.277973
## iter 90 value 5.540749
## iter 100 value 5.527754
## final value 5.527754
## stopped after 100 iterations
## # weights: 42
## initial value 46.308352
## iter 10 value 29.765087
## iter 20 value 29.353187
## final value 29.352122
## converged
## # weights: 124
## initial value 37.622206
## iter 10 value 31.921379
## iter 20 value 30.208674
## iter 30 value 26.196009
## iter 40 value 20.526759
## iter 50 value 18.869224
## iter 60 value 16.103252
## iter 70 value 9.806161
## iter 80 value 7.304875
## iter 90 value 7.280633
## iter 100 value 7.278884
## final value 7.278884
## stopped after 100 iterations
## # weights: 206
## initial value 45.092888
## iter 10 value 21.629442
## iter 20 value 1.723894
## iter 30 value 0.103299
## iter 40 value 0.015382
## iter 50 value 0.002185
## iter 60 value 0.000641
## iter 70 value 0.000324
## iter 80 value 0.000152
## final value 0.000086
## converged
## # weights: 42
## initial value 39.616814
## iter 10 value 37.050260
## iter 20 value 17.163941
## iter 30 value 13.883452
## iter 40 value 11.172521
## iter 50 value 11.153383
## final value 11.153374
## converged
## # weights: 124
## initial value 39.710930
## iter 10 value 20.886940
## iter 20 value 10.473511
## iter 30 value 9.368545
## iter 40 value 8.795251
## iter 50 value 8.619087
## iter 60 value 8.555874
## iter 70 value 8.554889
## final value 8.554888
## converged
## # weights: 206
## initial value 43.203719
## iter 10 value 20.936085
## iter 20 value 11.155461
## iter 30 value 9.174556
## iter 40 value 8.861554
## iter 50 value 8.640300
## iter 60 value 8.575960
## iter 70 value 8.557977
## iter 80 value 8.376927
## iter 90 value 8.261648
## iter 100 value 8.143604
## final value 8.143604
## stopped after 100 iterations
## # weights: 42
## initial value 38.328514
## iter 10 value 27.984087
## iter 20 value 11.544730
## iter 30 value 9.863053
## iter 40 value 9.856206
## iter 50 value 9.832277
## iter 60 value 9.161072
## iter 70 value 7.356195
## iter 80 value 7.344764
## iter 90 value 7.337085
## iter 100 value 7.330234
## final value 7.330234
## stopped after 100 iterations
## # weights: 124
## initial value 40.702488
## iter 10 value 18.257343
## iter 20 value 9.106661
## iter 30 value 9.036648
## iter 40 value 9.010428
## iter 50 value 8.999392
## iter 60 value 8.986183
## iter 70 value 7.400810
## iter 80 value 6.680697
## iter 90 value 6.670819
## iter 100 value 6.667999
## final value 6.667999
## stopped after 100 iterations
## # weights: 206
## initial value 48.166884
## iter 10 value 21.531674
## iter 20 value 18.933456
## iter 30 value 18.505510
## iter 40 value 14.353028
## iter 50 value 11.563237
## iter 60 value 11.515029
## iter 70 value 11.510734
## iter 80 value 11.503964
## iter 90 value 11.493780
## iter 100 value 11.113541
## final value 11.113541
## stopped after 100 iterations
## # weights: 42
## initial value 37.070020
## iter 10 value 36.033776
## iter 20 value 31.057842
## iter 30 value 30.645401
## final value 30.644961
## converged
## # weights: 124
## initial value 36.756406
## iter 10 value 28.161336
## iter 20 value 8.991627
## iter 30 value 4.300812
## iter 40 value 4.206734
## iter 50 value 3.754602
## iter 60 value 0.254742
## iter 70 value 0.015670
## iter 80 value 0.003142
## iter 90 value 0.000158
## final value 0.000077
## converged
## # weights: 206
## initial value 37.742855
## iter 10 value 24.910100
## iter 20 value 8.129825
## iter 30 value 0.257190
## iter 40 value 0.016538
## iter 50 value 0.003799
## iter 60 value 0.001863
## iter 70 value 0.000615
## final value 0.000087
## converged
## # weights: 42
## initial value 37.399772
## iter 10 value 30.633711
## iter 20 value 18.937725
## iter 30 value 14.866094
## iter 40 value 11.895688
## iter 50 value 11.704900
## final value 11.704626
## converged
## # weights: 124
## initial value 38.484282
## iter 10 value 20.586892
## iter 20 value 13.316343
## iter 30 value 10.856966
## iter 40 value 10.186980
## iter 50 value 9.947220
## iter 60 value 9.848237
## iter 70 value 9.640962
## iter 80 value 9.496411
## iter 90 value 9.367648
## iter 100 value 9.343362
## final value 9.343362
## stopped after 100 iterations
## # weights: 206
## initial value 45.398613
## iter 10 value 22.888823
## iter 20 value 13.960342
## iter 30 value 11.261294
## iter 40 value 10.114453
## iter 50 value 9.493566
## iter 60 value 9.237917
## iter 70 value 9.026626
## iter 80 value 8.952669
## iter 90 value 8.933259
## iter 100 value 8.932126
## final value 8.932126
## stopped after 100 iterations
## # weights: 42
## initial value 37.146249
## iter 10 value 20.968920
## iter 20 value 13.902538
## iter 30 value 9.512966
## iter 40 value 8.545659
## iter 50 value 8.530392
## iter 60 value 8.524400
## iter 70 value 8.523511
## iter 80 value 8.523200
## iter 90 value 8.522979
## iter 100 value 8.522408
## final value 8.522408
## stopped after 100 iterations
## # weights: 124
## initial value 38.293741
## iter 10 value 21.928029
## iter 20 value 16.059454
## iter 30 value 15.527439
## iter 40 value 15.029942
## iter 50 value 11.975757
## iter 60 value 9.742379
## iter 70 value 9.729056
## iter 80 value 9.311793
## iter 90 value 7.287283
## iter 100 value 7.282690
## final value 7.282690
## stopped after 100 iterations
## # weights: 206
## initial value 40.289151
## iter 10 value 18.896990
## iter 20 value 8.502690
## iter 30 value 6.255260
## iter 40 value 6.104334
## iter 50 value 5.898461
## iter 60 value 5.593734
## iter 70 value 5.371825
## iter 80 value 5.363388
## iter 90 value 5.340072
## iter 100 value 4.903913
## final value 4.903913
## stopped after 100 iterations
## # weights: 42
## initial value 38.411565
## iter 10 value 37.429997
## final value 37.429925
## converged
## # weights: 124
## initial value 37.455841
## iter 10 value 12.230705
## iter 20 value 9.757843
## iter 30 value 9.752730
## iter 40 value 9.752490
## final value 9.752489
## converged
## # weights: 206
## initial value 38.027907
## iter 10 value 4.595234
## iter 20 value 4.318395
## iter 30 value 4.314200
## final value 4.314143
## converged
## # weights: 42
## initial value 41.781294
## iter 10 value 35.372080
## iter 20 value 26.968211
## iter 30 value 18.685653
## iter 40 value 11.501890
## iter 50 value 10.863244
## iter 60 value 10.745051
## final value 10.745017
## converged
## # weights: 124
## initial value 50.444725
## iter 10 value 32.296577
## iter 20 value 22.584457
## iter 30 value 12.480851
## iter 40 value 9.517897
## iter 50 value 8.576779
## iter 60 value 8.411506
## iter 70 value 8.150315
## iter 80 value 8.094945
## iter 90 value 8.094823
## iter 90 value 8.094823
## iter 90 value 8.094823
## final value 8.094823
## converged
## # weights: 206
## initial value 56.026967
## iter 10 value 25.763204
## iter 20 value 14.963263
## iter 30 value 10.603340
## iter 40 value 10.046471
## iter 50 value 9.734191
## iter 60 value 8.564662
## iter 70 value 7.848192
## iter 80 value 7.701127
## iter 90 value 7.639896
## iter 100 value 7.615001
## final value 7.615001
## stopped after 100 iterations
## # weights: 42
## initial value 44.301410
## iter 10 value 17.172409
## iter 20 value 9.906469
## iter 30 value 9.798778
## iter 40 value 9.795227
## iter 50 value 9.792683
## iter 60 value 9.790592
## iter 70 value 9.788245
## iter 80 value 9.787303
## iter 90 value 9.786224
## iter 100 value 9.785821
## final value 9.785821
## stopped after 100 iterations
## # weights: 124
## initial value 37.573923
## iter 10 value 24.576493
## iter 20 value 17.462144
## iter 30 value 16.621938
## iter 40 value 14.335973
## iter 50 value 14.294984
## iter 60 value 14.291039
## iter 70 value 14.284848
## iter 80 value 11.561287
## iter 90 value 11.480860
## iter 100 value 11.479160
## final value 11.479160
## stopped after 100 iterations
## # weights: 206
## initial value 40.573828
## iter 10 value 24.970926
## iter 20 value 8.346592
## iter 30 value 4.631435
## iter 40 value 4.376762
## iter 50 value 4.371902
## iter 60 value 0.391736
## iter 70 value 0.127579
## iter 80 value 0.118906
## iter 90 value 0.114409
## iter 100 value 0.100293
## final value 0.100293
## stopped after 100 iterations
## # weights: 42
## initial value 37.548004
## iter 10 value 22.125877
## iter 20 value 19.523451
## iter 30 value 17.824338
## iter 40 value 17.810105
## iter 50 value 17.809630
## iter 60 value 17.809439
## final value 17.809437
## converged
## # weights: 124
## initial value 40.097215
## iter 10 value 21.435866
## iter 20 value 15.686340
## iter 30 value 15.229455
## iter 40 value 15.051408
## iter 50 value 15.012382
## iter 60 value 14.959856
## iter 70 value 14.792090
## iter 80 value 14.783589
## iter 90 value 14.619619
## iter 100 value 13.214744
## final value 13.214744
## stopped after 100 iterations
## # weights: 206
## initial value 43.660127
## iter 10 value 31.187628
## iter 20 value 14.145669
## iter 30 value 9.308963
## iter 40 value 9.275791
## iter 50 value 9.275109
## final value 9.275106
## converged
## # weights: 42
## initial value 38.389152
## iter 10 value 37.355865
## iter 20 value 19.621582
## iter 30 value 11.952660
## iter 40 value 11.360892
## iter 50 value 11.359728
## final value 11.359728
## converged
## # weights: 124
## initial value 43.765794
## iter 10 value 28.421950
## iter 20 value 18.126650
## iter 30 value 13.435832
## iter 40 value 11.024300
## iter 50 value 9.441826
## iter 60 value 9.099920
## iter 70 value 8.874159
## iter 80 value 8.793238
## iter 90 value 8.792420
## final value 8.792420
## converged
## # weights: 206
## initial value 51.396361
## iter 10 value 33.180808
## iter 20 value 15.210539
## iter 30 value 10.725412
## iter 40 value 9.203667
## iter 50 value 8.980929
## iter 60 value 8.660941
## iter 70 value 8.523127
## iter 80 value 8.465812
## iter 90 value 8.454999
## iter 100 value 8.377635
## final value 8.377635
## stopped after 100 iterations
## # weights: 42
## initial value 37.803164
## iter 10 value 24.812001
## iter 20 value 17.381439
## iter 30 value 17.343615
## iter 40 value 15.729465
## iter 50 value 15.548654
## iter 60 value 9.897766
## iter 70 value 5.345354
## iter 80 value 4.370576
## iter 90 value 4.361931
## iter 100 value 4.360404
## final value 4.360404
## stopped after 100 iterations
## # weights: 124
## initial value 38.798998
## iter 10 value 16.201607
## iter 20 value 11.496287
## iter 30 value 8.633798
## iter 40 value 5.382864
## iter 50 value 4.381763
## iter 60 value 4.377988
## iter 70 value 4.371231
## iter 80 value 1.556101
## iter 90 value 0.106136
## iter 100 value 0.094361
## final value 0.094361
## stopped after 100 iterations
## # weights: 206
## initial value 38.189545
## iter 10 value 25.465546
## iter 20 value 3.854228
## iter 30 value 0.166762
## iter 40 value 0.100195
## iter 50 value 0.095245
## iter 60 value 0.087561
## iter 70 value 0.084259
## iter 80 value 0.080594
## iter 90 value 0.079113
## iter 100 value 0.077520
## final value 0.077520
## stopped after 100 iterations
## # weights: 42
## initial value 39.534061
## iter 10 value 16.731867
## iter 20 value 13.874719
## iter 30 value 13.868775
## final value 13.868775
## converged
## # weights: 124
## initial value 37.463130
## iter 10 value 15.900457
## iter 20 value 11.521418
## iter 30 value 11.409597
## iter 40 value 11.404440
## iter 50 value 11.403604
## final value 11.403474
## converged
## # weights: 206
## initial value 36.782596
## iter 10 value 17.519912
## iter 20 value 13.876930
## iter 30 value 13.861142
## iter 40 value 11.921481
## final value 11.920894
## converged
## # weights: 42
## initial value 37.490271
## iter 10 value 33.829511
## iter 20 value 21.976338
## iter 30 value 13.302050
## iter 40 value 11.023974
## iter 50 value 11.009133
## final value 11.009129
## converged
## # weights: 124
## initial value 38.662704
## iter 10 value 31.659784
## iter 20 value 14.766463
## iter 30 value 10.003717
## iter 40 value 9.045572
## iter 50 value 8.740712
## iter 60 value 8.600255
## iter 70 value 8.479791
## iter 80 value 8.478166
## final value 8.478163
## converged
## # weights: 206
## initial value 40.411684
## iter 10 value 24.896661
## iter 20 value 16.018690
## iter 30 value 10.010429
## iter 40 value 8.807327
## iter 50 value 8.618224
## iter 60 value 8.404275
## iter 70 value 8.208228
## iter 80 value 8.122538
## iter 90 value 8.102755
## iter 100 value 8.092622
## final value 8.092622
## stopped after 100 iterations
## # weights: 42
## initial value 36.822780
## iter 10 value 17.021046
## iter 20 value 11.712270
## iter 30 value 11.444023
## iter 40 value 11.439250
## iter 50 value 11.437732
## iter 60 value 11.436725
## iter 70 value 11.436018
## iter 80 value 11.328138
## iter 90 value 8.627101
## iter 100 value 8.554635
## final value 8.554635
## stopped after 100 iterations
## # weights: 124
## initial value 43.655862
## iter 10 value 36.729205
## iter 20 value 36.727115
## iter 30 value 24.122440
## iter 40 value 14.279970
## iter 50 value 14.250465
## iter 60 value 14.237088
## iter 70 value 14.232674
## iter 80 value 14.227975
## iter 90 value 12.953622
## iter 100 value 9.737514
## final value 9.737514
## stopped after 100 iterations
## # weights: 206
## initial value 48.175910
## iter 10 value 26.785075
## iter 20 value 8.107855
## iter 30 value 7.501987
## iter 40 value 0.727674
## iter 50 value 0.157836
## iter 60 value 0.139401
## iter 70 value 0.132482
## iter 80 value 0.126400
## iter 90 value 0.115716
## iter 100 value 0.105940
## final value 0.105940
## stopped after 100 iterations
## # weights: 42
## initial value 38.025891
## iter 10 value 37.429979
## final value 37.429948
## converged
## # weights: 124
## initial value 41.937550
## iter 10 value 25.479599
## iter 20 value 13.946206
## iter 30 value 12.920776
## iter 40 value 9.896111
## iter 50 value 4.477071
## iter 60 value 4.331683
## iter 70 value 4.318635
## iter 80 value 4.314849
## iter 90 value 4.309166
## iter 100 value 0.119476
## final value 0.119476
## stopped after 100 iterations
## # weights: 206
## initial value 38.479977
## iter 10 value 14.251711
## iter 20 value 12.162107
## iter 30 value 3.251655
## iter 40 value 2.779575
## iter 50 value 2.773293
## iter 60 value 2.772988
## iter 70 value 2.771518
## iter 80 value 1.919216
## iter 90 value 1.910029
## iter 100 value 1.909783
## final value 1.909783
## stopped after 100 iterations
## # weights: 42
## initial value 40.082722
## iter 10 value 31.658924
## iter 20 value 26.534170
## iter 30 value 15.484922
## iter 40 value 10.425718
## iter 50 value 10.336434
## final value 10.336409
## converged
## # weights: 124
## initial value 40.404025
## iter 10 value 20.175198
## iter 20 value 11.633596
## iter 30 value 9.941047
## iter 40 value 8.830414
## iter 50 value 7.901000
## iter 60 value 7.700770
## iter 70 value 7.626051
## iter 80 value 7.612893
## iter 90 value 7.607735
## iter 100 value 7.607378
## final value 7.607378
## stopped after 100 iterations
## # weights: 206
## initial value 47.511931
## iter 10 value 28.905853
## iter 20 value 12.196812
## iter 30 value 9.096650
## iter 40 value 7.947732
## iter 50 value 7.558000
## iter 60 value 7.457165
## iter 70 value 7.231514
## iter 80 value 7.139516
## iter 90 value 7.131985
## iter 100 value 7.127636
## final value 7.127636
## stopped after 100 iterations
## # weights: 42
## initial value 46.051386
## iter 10 value 37.048507
## iter 20 value 26.521176
## iter 30 value 26.457851
## iter 40 value 24.976779
## iter 50 value 8.003395
## iter 60 value 4.377853
## iter 70 value 4.362012
## iter 80 value 4.359140
## iter 90 value 4.355717
## iter 100 value 4.350772
## final value 4.350772
## stopped after 100 iterations
## # weights: 124
## initial value 39.002309
## iter 10 value 19.603628
## iter 20 value 2.177405
## iter 30 value 0.263476
## iter 40 value 0.201591
## iter 50 value 0.166607
## iter 60 value 0.146680
## iter 70 value 0.116197
## iter 80 value 0.086895
## iter 90 value 0.072990
## iter 100 value 0.068951
## final value 0.068951
## stopped after 100 iterations
## # weights: 206
## initial value 42.857255
## iter 10 value 37.158422
## iter 20 value 7.071053
## iter 30 value 4.399584
## iter 40 value 4.382183
## iter 50 value 4.378997
## iter 60 value 4.375335
## iter 70 value 4.370412
## iter 80 value 0.493308
## iter 90 value 0.078963
## iter 100 value 0.067140
## final value 0.067140
## stopped after 100 iterations
## # weights: 42
## initial value 40.210919
## iter 10 value 23.440130
## iter 20 value 14.528303
## iter 30 value 13.880370
## iter 40 value 13.869418
## iter 50 value 13.869106
## iter 60 value 13.868993
## iter 70 value 13.868834
## final value 13.868832
## converged
## # weights: 124
## initial value 37.134392
## iter 10 value 24.560625
## iter 20 value 5.670535
## iter 30 value 4.108917
## iter 40 value 2.736939
## iter 50 value 2.711498
## iter 60 value 2.591346
## iter 70 value 0.147297
## iter 80 value 0.011545
## iter 90 value 0.002107
## iter 100 value 0.001299
## final value 0.001299
## stopped after 100 iterations
## # weights: 206
## initial value 37.508940
## iter 10 value 26.988082
## iter 20 value 16.595331
## iter 30 value 1.809778
## iter 40 value 0.023192
## iter 50 value 0.000441
## final value 0.000091
## converged
## # weights: 42
## initial value 40.980765
## iter 10 value 24.017315
## iter 20 value 13.412198
## iter 30 value 11.821037
## iter 40 value 11.758860
## final value 11.758841
## converged
## # weights: 124
## initial value 43.031852
## iter 10 value 27.048817
## iter 20 value 19.598104
## iter 30 value 15.326800
## iter 40 value 11.756266
## iter 50 value 10.378106
## iter 60 value 9.692596
## iter 70 value 9.640454
## iter 80 value 9.579284
## iter 90 value 9.492822
## iter 100 value 9.274343
## final value 9.274343
## stopped after 100 iterations
## # weights: 206
## initial value 48.075681
## iter 10 value 29.995421
## iter 20 value 20.785802
## iter 30 value 17.396694
## iter 40 value 13.077924
## iter 50 value 10.470727
## iter 60 value 9.444089
## iter 70 value 9.119904
## iter 80 value 9.045365
## iter 90 value 9.010697
## iter 100 value 9.009802
## final value 9.009802
## stopped after 100 iterations
## # weights: 42
## initial value 39.769768
## iter 10 value 21.853350
## iter 20 value 18.872817
## iter 30 value 16.090358
## iter 40 value 13.946389
## iter 50 value 13.927837
## iter 60 value 13.924106
## iter 70 value 13.918881
## iter 80 value 13.915289
## iter 90 value 10.601399
## iter 100 value 9.772545
## final value 9.772545
## stopped after 100 iterations
## # weights: 124
## initial value 37.693101
## iter 10 value 36.729380
## iter 20 value 36.551135
## iter 30 value 15.088364
## iter 40 value 2.012575
## iter 50 value 0.254324
## iter 60 value 0.221866
## iter 70 value 0.166803
## iter 80 value 0.141152
## iter 90 value 0.127642
## iter 100 value 0.118398
## final value 0.118398
## stopped after 100 iterations
## # weights: 206
## initial value 38.127118
## iter 10 value 18.792930
## iter 20 value 7.424094
## iter 30 value 4.590719
## iter 40 value 1.873572
## iter 50 value 0.192823
## iter 60 value 0.129307
## iter 70 value 0.122956
## iter 80 value 0.115540
## iter 90 value 0.102926
## iter 100 value 0.095656
## final value 0.095656
## stopped after 100 iterations
## # weights: 42
## initial value 41.848642
## iter 10 value 37.231512
## iter 20 value 19.239153
## iter 30 value 16.649046
## iter 40 value 16.585121
## iter 50 value 16.584918
## iter 50 value 16.584918
## iter 50 value 16.584918
## final value 16.584918
## converged
## # weights: 124
## initial value 36.609565
## iter 10 value 15.903635
## iter 20 value 6.593715
## iter 30 value 5.587703
## iter 40 value 5.318709
## iter 50 value 5.005900
## iter 60 value 5.004527
## iter 70 value 5.004215
## iter 80 value 5.003362
## iter 90 value 3.470911
## iter 100 value 3.139574
## final value 3.139574
## stopped after 100 iterations
## # weights: 206
## initial value 37.337098
## iter 10 value 18.589783
## iter 20 value 6.019227
## iter 30 value 4.177840
## iter 40 value 2.361041
## iter 50 value 2.265224
## iter 60 value 2.261691
## iter 70 value 2.223261
## iter 80 value 0.102420
## iter 90 value 0.016229
## iter 100 value 0.007621
## final value 0.007621
## stopped after 100 iterations
## # weights: 42
## initial value 38.425938
## iter 10 value 31.910871
## iter 20 value 18.775650
## iter 30 value 14.877947
## iter 40 value 14.477471
## final value 14.477253
## converged
## # weights: 124
## initial value 39.343005
## iter 10 value 27.143197
## iter 20 value 18.447820
## iter 30 value 14.202375
## iter 40 value 11.691931
## iter 50 value 11.096465
## iter 60 value 10.669250
## iter 70 value 10.515583
## iter 80 value 10.371746
## iter 90 value 10.369354
## final value 10.369354
## converged
## # weights: 206
## initial value 43.062347
## iter 10 value 26.378602
## iter 20 value 17.059320
## iter 30 value 12.340350
## iter 40 value 11.398732
## iter 50 value 10.793200
## iter 60 value 10.279638
## iter 70 value 10.172333
## iter 80 value 10.115973
## iter 90 value 10.076911
## iter 100 value 10.058568
## final value 10.058568
## stopped after 100 iterations
## # weights: 42
## initial value 37.436256
## iter 10 value 32.272369
## iter 20 value 31.029835
## iter 30 value 25.305771
## iter 40 value 24.153967
## iter 50 value 24.109644
## iter 60 value 21.752349
## iter 70 value 16.331701
## iter 80 value 14.066942
## iter 90 value 13.932903
## iter 100 value 12.007984
## final value 12.007984
## stopped after 100 iterations
## # weights: 124
## initial value 39.053581
## iter 10 value 36.855421
## iter 20 value 24.644430
## iter 30 value 18.705487
## iter 40 value 14.499210
## iter 50 value 13.360534
## iter 60 value 13.214521
## iter 70 value 8.993740
## iter 80 value 8.600980
## iter 90 value 8.217186
## iter 100 value 8.064280
## final value 8.064280
## stopped after 100 iterations
## # weights: 206
## initial value 54.459283
## iter 10 value 35.952840
## iter 20 value 22.245430
## iter 30 value 18.229399
## iter 40 value 14.248714
## iter 50 value 14.036386
## iter 60 value 11.155018
## iter 70 value 10.694541
## iter 80 value 7.527516
## iter 90 value 6.583679
## iter 100 value 6.234755
## final value 6.234755
## stopped after 100 iterations
## # weights: 42
## initial value 37.488085
## final value 36.727366
## converged
## # weights: 124
## initial value 35.593471
## iter 10 value 13.092325
## iter 20 value 7.302160
## iter 30 value 7.277931
## iter 40 value 7.277699
## final value 7.277689
## converged
## # weights: 206
## initial value 37.897894
## iter 10 value 17.211991
## iter 20 value 8.508231
## iter 30 value 1.642643
## iter 40 value 1.392920
## iter 50 value 1.387089
## iter 60 value 1.386601
## iter 70 value 1.386296
## final value 1.386296
## converged
## # weights: 42
## initial value 39.009752
## iter 10 value 20.348810
## iter 20 value 13.347197
## iter 30 value 13.102301
## final value 13.102221
## converged
## # weights: 124
## initial value 38.887363
## iter 10 value 23.390790
## iter 20 value 10.848009
## iter 30 value 8.731017
## iter 40 value 8.643957
## iter 50 value 8.483798
## iter 60 value 8.299049
## iter 70 value 8.290567
## final value 8.290558
## converged
## # weights: 206
## initial value 45.105708
## iter 10 value 18.617624
## iter 20 value 10.336690
## iter 30 value 8.566743
## iter 40 value 8.134603
## iter 50 value 8.046936
## iter 60 value 7.974381
## iter 70 value 7.942497
## iter 80 value 7.925047
## iter 90 value 7.915593
## final value 7.915552
## converged
## # weights: 42
## initial value 39.307927
## iter 10 value 28.491216
## iter 20 value 21.391781
## iter 30 value 12.385011
## iter 40 value 7.335279
## iter 50 value 7.320454
## iter 60 value 7.307147
## iter 70 value 7.295089
## iter 80 value 7.289126
## iter 90 value 7.279025
## iter 100 value 7.260275
## final value 7.260275
## stopped after 100 iterations
## # weights: 124
## initial value 39.441352
## iter 10 value 20.566067
## iter 20 value 7.334785
## iter 30 value 0.447768
## iter 40 value 0.238724
## iter 50 value 0.210096
## iter 60 value 0.168606
## iter 70 value 0.133141
## iter 80 value 0.119019
## iter 90 value 0.091930
## iter 100 value 0.088367
## final value 0.088367
## stopped after 100 iterations
## # weights: 206
## initial value 73.345166
## iter 10 value 36.745044
## iter 20 value 14.583059
## iter 30 value 11.511169
## iter 40 value 11.434123
## iter 50 value 11.421317
## iter 60 value 11.419148
## iter 70 value 11.414254
## iter 80 value 8.660708
## iter 90 value 8.572572
## iter 100 value 8.565985
## final value 8.565985
## stopped after 100 iterations
## # weights: 42
## initial value 36.712305
## iter 10 value 27.094489
## iter 20 value 16.775631
## iter 30 value 11.535340
## iter 40 value 11.410580
## iter 50 value 11.403638
## iter 60 value 11.403552
## iter 70 value 11.403527
## final value 11.403526
## converged
## # weights: 124
## initial value 41.967192
## iter 10 value 20.457428
## iter 20 value 11.383853
## iter 30 value 5.180016
## iter 40 value 3.216818
## iter 50 value 1.459336
## iter 60 value 1.396565
## iter 70 value 0.123482
## iter 80 value 0.019019
## iter 90 value 0.006086
## iter 100 value 0.002040
## final value 0.002040
## stopped after 100 iterations
## # weights: 206
## initial value 40.505568
## iter 10 value 21.572352
## iter 20 value 6.603643
## iter 30 value 6.169343
## iter 40 value 2.158371
## iter 50 value 1.941948
## iter 60 value 1.913800
## iter 70 value 1.911336
## iter 80 value 1.910117
## iter 90 value 1.909900
## iter 100 value 1.909622
## final value 1.909622
## stopped after 100 iterations
## # weights: 42
## initial value 37.857127
## iter 10 value 31.957989
## iter 20 value 19.670359
## iter 30 value 13.929724
## iter 40 value 11.279966
## iter 50 value 11.029412
## final value 11.029316
## converged
## # weights: 124
## initial value 38.342936
## iter 10 value 17.567166
## iter 20 value 10.899773
## iter 30 value 10.042349
## iter 40 value 8.967881
## iter 50 value 8.749023
## iter 60 value 8.519625
## iter 70 value 8.461196
## iter 80 value 8.460736
## final value 8.460734
## converged
## # weights: 206
## initial value 47.154179
## iter 10 value 32.018575
## iter 20 value 12.543444
## iter 30 value 10.305182
## iter 40 value 8.608957
## iter 50 value 8.058560
## iter 60 value 8.028421
## iter 70 value 8.027040
## final value 8.027019
## converged
## # weights: 42
## initial value 37.106244
## iter 10 value 17.143074
## iter 20 value 10.541141
## iter 30 value 7.283446
## iter 40 value 4.429751
## iter 50 value 4.345495
## iter 60 value 4.337511
## iter 70 value 4.330028
## iter 80 value 4.325601
## iter 90 value 4.322300
## iter 100 value 4.320161
## final value 4.320161
## stopped after 100 iterations
## # weights: 124
## initial value 45.178614
## iter 10 value 25.092366
## iter 20 value 0.347391
## iter 30 value 0.201260
## iter 40 value 0.186050
## iter 50 value 0.163188
## iter 60 value 0.151108
## iter 70 value 0.134415
## iter 80 value 0.119796
## iter 90 value 0.109507
## iter 100 value 0.092051
## final value 0.092051
## stopped after 100 iterations
## # weights: 206
## initial value 38.835100
## iter 10 value 18.634577
## iter 20 value 7.057020
## iter 30 value 1.781651
## iter 40 value 0.554168
## iter 50 value 0.291356
## iter 60 value 0.264583
## iter 70 value 0.224329
## iter 80 value 0.200153
## iter 90 value 0.184896
## iter 100 value 0.156526
## final value 0.156526
## stopped after 100 iterations
## # weights: 124
## initial value 59.503329
## iter 10 value 31.533984
## iter 20 value 23.244036
## iter 30 value 15.611695
## iter 40 value 13.993738
## iter 50 value 13.216017
## iter 60 value 12.999943
## iter 70 value 12.882509
## iter 80 value 12.844642
## iter 90 value 12.776793
## iter 100 value 12.766262
## final value 12.766262
## stopped after 100 iterations
# Stop the timer started for the previous model fit and report elapsed time
tictoc::toc()
## 4.58 sec elapsed
# Train a neural net (method = "nnet") over the full tuning grid, timing the fit
tictoc::tic()
set.seed(42)  # reproducible resampling and initial network weights
# NOTE(review): `verbose = FALSE` only quiets caret's own progress logging.
# The "# weights: ... / iter N value ... / final value" trace seen above is
# printed by nnet itself; `trace = FALSE` is forwarded via caret's `...` to
# nnet::nnet and suppresses it.
nn_model_voters_big_grid <- caret::train(
  turnout16_2016 ~ ., data = voters_train_data, method = "nnet",
  trControl = fitControl, verbose = FALSE, trace = FALSE,
  tuneGrid = big_grid
)
## # weights: 42
## initial value 37.543902
## iter 10 value 31.027084
## iter 20 value 11.826996
## iter 30 value 11.383068
## iter 40 value 11.369768
## iter 50 value 11.368247
## iter 60 value 11.368065
## final value 11.368020
## converged
## # weights: 83
## initial value 39.766022
## iter 10 value 15.937860
## iter 20 value 11.422396
## iter 30 value 11.375451
## iter 40 value 11.368608
## iter 50 value 11.368059
## final value 11.368036
## converged
## # weights: 124
## initial value 40.595251
## iter 10 value 23.262080
## iter 20 value 18.405645
## iter 30 value 15.651999
## iter 40 value 15.647162
## iter 50 value 15.646567
## iter 60 value 14.041820
## iter 70 value 13.872262
## iter 80 value 13.869025
## final value 13.868985
## converged
## # weights: 165
## initial value 40.365339
## iter 10 value 23.967523
## iter 20 value 6.106408
## iter 30 value 5.743379
## iter 40 value 0.036464
## iter 50 value 0.004550
## iter 60 value 0.000387
## final value 0.000089
## converged
## # weights: 206
## initial value 38.064241
## iter 10 value 13.838685
## iter 20 value 7.748838
## iter 30 value 7.157892
## iter 40 value 7.148233
## iter 50 value 7.147933
## iter 60 value 7.147907
## final value 7.147900
## converged
## # weights: 42
## initial value 43.026823
## iter 10 value 34.625575
## iter 20 value 32.606851
## iter 30 value 32.598476
## final value 32.598460
## converged
## # weights: 83
## initial value 54.080772
## iter 10 value 36.149285
## iter 20 value 33.706759
## iter 30 value 32.579531
## iter 40 value 30.532952
## iter 50 value 30.381645
## final value 30.381475
## converged
## # weights: 124
## initial value 55.946923
## iter 10 value 34.648882
## iter 20 value 30.961345
## iter 30 value 29.938476
## iter 40 value 29.924421
## iter 50 value 29.922615
## final value 29.922606
## converged
## # weights: 165
## initial value 76.853817
## iter 10 value 35.478352
## iter 20 value 30.177435
## iter 30 value 29.678601
## iter 40 value 29.472089
## iter 50 value 29.398254
## iter 60 value 29.393732
## iter 70 value 29.393702
## iter 70 value 29.393702
## iter 70 value 29.393702
## final value 29.393702
## converged
## # weights: 206
## initial value 72.169357
## iter 10 value 35.147438
## iter 20 value 30.651734
## iter 30 value 29.489206
## iter 40 value 29.326520
## iter 50 value 29.214437
## iter 60 value 29.206726
## final value 29.205957
## converged
## # weights: 42
## initial value 37.568710
## iter 10 value 20.528396
## iter 20 value 16.713563
## iter 30 value 12.500015
## iter 40 value 11.459355
## iter 50 value 11.447812
## iter 60 value 11.446075
## iter 70 value 11.444896
## iter 80 value 11.444180
## iter 90 value 11.444148
## iter 100 value 11.443674
## final value 11.443674
## stopped after 100 iterations
## # weights: 83
## initial value 41.854691
## iter 10 value 14.541530
## iter 20 value 10.128697
## iter 30 value 9.762728
## iter 40 value 9.499098
## iter 50 value 4.706413
## iter 60 value 4.320181
## iter 70 value 4.315702
## iter 80 value 4.315041
## iter 90 value 4.314585
## iter 100 value 4.314517
## final value 4.314517
## stopped after 100 iterations
## # weights: 124
## initial value 39.901995
## iter 10 value 29.119445
## iter 20 value 12.304345
## iter 30 value 5.889038
## iter 40 value 5.751436
## iter 50 value 5.732129
## iter 60 value 5.443309
## iter 70 value 0.022520
## final value 0.000051
## converged
## # weights: 165
## initial value 39.029729
## iter 10 value 15.710362
## iter 20 value 0.749600
## iter 30 value 0.005668
## iter 40 value 0.000122
## iter 40 value 0.000074
## iter 40 value 0.000070
## final value 0.000070
## converged
## # weights: 206
## initial value 47.157550
## iter 10 value 23.896673
## iter 20 value 17.318332
## iter 30 value 10.567425
## iter 40 value 9.767979
## iter 50 value 9.756515
## iter 60 value 9.753882
## iter 70 value 9.689147
## iter 80 value 7.306195
## iter 90 value 7.278802
## iter 100 value 5.609310
## final value 5.609310
## stopped after 100 iterations
## # weights: 42
## initial value 45.367030
## iter 10 value 33.596810
## iter 20 value 32.298023
## final value 32.297824
## converged
## # weights: 83
## initial value 53.163581
## iter 10 value 36.185830
## iter 20 value 32.426721
## iter 30 value 30.774634
## iter 40 value 29.834471
## iter 50 value 29.831662
## final value 29.831646
## converged
## # weights: 124
## initial value 62.253214
## iter 10 value 33.520095
## iter 20 value 29.650404
## iter 30 value 29.299605
## iter 40 value 29.240396
## final value 29.240151
## converged
## # weights: 165
## initial value 63.239335
## iter 10 value 33.441372
## iter 20 value 29.287492
## iter 30 value 28.650045
## iter 40 value 28.597879
## iter 50 value 28.597343
## final value 28.597343
## converged
## # weights: 206
## initial value 85.700902
## iter 10 value 34.554528
## iter 20 value 30.436849
## iter 30 value 28.637195
## iter 40 value 28.350130
## iter 50 value 28.347024
## final value 28.347017
## converged
## # weights: 42
## initial value 37.900580
## iter 10 value 23.617936
## iter 20 value 18.547715
## iter 30 value 18.521900
## iter 40 value 18.519423
## iter 50 value 18.518999
## iter 60 value 18.515145
## iter 70 value 16.570951
## iter 80 value 14.237539
## iter 90 value 14.181777
## iter 100 value 14.180876
## final value 14.180876
## stopped after 100 iterations
## # weights: 83
## initial value 36.916722
## iter 10 value 36.727349
## iter 20 value 35.059780
## iter 30 value 32.200624
## iter 40 value 13.899447
## iter 50 value 13.698580
## iter 60 value 13.695990
## final value 13.695938
## converged
## # weights: 124
## initial value 37.451885
## iter 10 value 18.013827
## iter 20 value 13.619063
## iter 30 value 8.391131
## iter 40 value 8.230179
## iter 50 value 7.715214
## iter 60 value 7.686716
## iter 70 value 7.685461
## iter 80 value 7.614544
## iter 90 value 7.600645
## iter 100 value 7.514793
## final value 7.514793
## stopped after 100 iterations
## # weights: 165
## initial value 38.247757
## iter 10 value 22.338755
## iter 20 value 13.909412
## iter 30 value 13.738750
## iter 40 value 13.734036
## iter 50 value 13.732982
## iter 60 value 13.732601
## final value 13.732598
## converged
## # weights: 206
## initial value 38.440112
## iter 10 value 19.380722
## iter 20 value 9.743034
## iter 30 value 7.411345
## iter 40 value 0.088703
## iter 50 value 0.003649
## final value 0.000089
## converged
## # weights: 42
## initial value 46.391221
## iter 10 value 36.545401
## iter 20 value 31.412226
## iter 30 value 31.230800
## final value 31.230763
## converged
## # weights: 83
## initial value 55.121059
## iter 10 value 36.549576
## iter 20 value 31.908430
## iter 30 value 31.344571
## iter 40 value 30.844813
## iter 50 value 30.446359
## iter 60 value 30.444957
## final value 30.444954
## converged
## # weights: 124
## initial value 65.332656
## iter 10 value 34.743900
## iter 20 value 31.124164
## iter 30 value 28.914376
## iter 40 value 28.108523
## iter 50 value 28.022216
## iter 60 value 28.020085
## iter 60 value 28.020085
## iter 60 value 28.020085
## final value 28.020085
## converged
## # weights: 165
## initial value 65.640103
## iter 10 value 31.558324
## iter 20 value 29.115526
## iter 30 value 28.127830
## iter 40 value 27.736573
## iter 50 value 27.404964
## iter 60 value 27.398359
## final value 27.398348
## converged
## # weights: 206
## initial value 74.842233
## iter 10 value 32.556911
## iter 20 value 27.873902
## iter 30 value 27.386812
## iter 40 value 27.191109
## iter 50 value 27.173951
## iter 60 value 27.171327
## iter 60 value 27.171327
## iter 60 value 27.171327
## final value 27.171327
## converged
## # weights: 42
## initial value 38.783776
## iter 10 value 15.839496
## iter 20 value 9.031899
## iter 30 value 4.373613
## iter 40 value 4.315676
## iter 50 value 4.315058
## iter 60 value 4.314250
## iter 70 value 4.314186
## final value 4.314131
## converged
## # weights: 83
## initial value 38.695867
## iter 10 value 13.075593
## iter 20 value 4.754923
## iter 30 value 0.096104
## iter 40 value 0.002264
## iter 50 value 0.000312
## final value 0.000083
## converged
## # weights: 124
## initial value 46.247989
## iter 10 value 15.875757
## iter 20 value 4.075813
## iter 30 value 2.506456
## iter 40 value 2.502080
## final value 2.502012
## converged
## # weights: 165
## initial value 44.061821
## iter 10 value 20.078951
## iter 20 value 3.574405
## iter 30 value 0.243555
## iter 40 value 0.002424
## iter 50 value 0.000577
## iter 60 value 0.000115
## iter 60 value 0.000092
## iter 60 value 0.000090
## final value 0.000090
## converged
## # weights: 206
## initial value 37.575512
## iter 10 value 24.986633
## iter 20 value 18.820758
## iter 30 value 15.694899
## iter 40 value 15.650033
## iter 50 value 15.648400
## iter 60 value 15.646978
## iter 70 value 15.587156
## iter 80 value 6.940235
## iter 90 value 4.317656
## iter 100 value 4.315110
## final value 4.315110
## stopped after 100 iterations
## # weights: 42
## initial value 48.033054
## iter 10 value 35.563953
## iter 20 value 32.471288
## iter 30 value 32.435824
## final value 32.435717
## converged
## # weights: 83
## initial value 58.979426
## iter 10 value 34.264113
## iter 20 value 32.456782
## iter 30 value 32.132788
## iter 40 value 31.678700
## iter 50 value 31.677489
## final value 31.677474
## converged
## # weights: 124
## initial value 58.111063
## iter 10 value 35.757496
## iter 20 value 30.458979
## iter 30 value 29.445538
## iter 40 value 29.401603
## iter 50 value 29.397377
## iter 60 value 29.396892
## final value 29.396879
## converged
## # weights: 165
## initial value 60.842265
## iter 10 value 35.095744
## iter 20 value 30.670761
## iter 30 value 29.565826
## iter 40 value 29.440583
## iter 50 value 29.085018
## iter 60 value 28.835858
## iter 70 value 28.832016
## iter 80 value 28.831029
## final value 28.831026
## converged
## # weights: 206
## initial value 70.575570
## iter 10 value 36.419099
## iter 20 value 30.925942
## iter 30 value 29.213017
## iter 40 value 28.678287
## iter 50 value 28.617776
## iter 60 value 28.604114
## iter 70 value 28.603628
## final value 28.603595
## converged
## # weights: 42
## initial value 36.113034
## iter 10 value 18.542105
## iter 20 value 14.102225
## iter 30 value 11.337182
## iter 40 value 11.331034
## iter 50 value 11.329888
## iter 60 value 11.329013
## iter 70 value 11.328435
## iter 80 value 11.328078
## iter 90 value 11.328047
## final value 11.328043
## converged
## # weights: 83
## initial value 36.167664
## iter 10 value 29.608404
## iter 20 value 11.657390
## iter 30 value 11.343257
## iter 40 value 9.944105
## iter 50 value 8.479723
## iter 60 value 8.477983
## iter 70 value 8.477756
## iter 80 value 8.477655
## iter 90 value 8.477421
## iter 100 value 8.477355
## final value 8.477355
## stopped after 100 iterations
## # weights: 124
## initial value 36.617189
## iter 10 value 18.844325
## iter 20 value 15.444874
## iter 30 value 11.978516
## iter 40 value 9.670243
## iter 50 value 9.646207
## iter 60 value 9.645359
## final value 9.645357
## converged
## # weights: 165
## initial value 37.192876
## iter 10 value 23.741880
## iter 20 value 9.040261
## iter 30 value 4.315039
## iter 40 value 2.795581
## iter 50 value 2.726782
## iter 60 value 2.705688
## iter 70 value 2.332897
## iter 80 value 2.246385
## iter 90 value 1.925591
## iter 100 value 1.913708
## final value 1.913708
## stopped after 100 iterations
## # weights: 206
## initial value 36.801803
## iter 10 value 15.150003
## iter 20 value 11.372518
## iter 30 value 11.339694
## iter 40 value 11.333059
## iter 50 value 11.328783
## iter 60 value 11.328400
## iter 70 value 11.328036
## iter 80 value 11.328018
## final value 11.328013
## converged
## # weights: 42
## initial value 52.712783
## iter 10 value 35.810004
## iter 20 value 31.692653
## iter 30 value 31.521966
## final value 31.521780
## converged
## # weights: 83
## initial value 49.791005
## iter 10 value 31.784732
## iter 20 value 30.300636
## iter 30 value 29.222412
## iter 40 value 29.216654
## final value 29.216653
## converged
## # weights: 124
## initial value 61.333051
## iter 10 value 32.386772
## iter 20 value 29.598959
## iter 30 value 28.721693
## iter 40 value 28.696703
## final value 28.696698
## converged
## # weights: 165
## initial value 67.695635
## iter 10 value 33.636308
## iter 20 value 29.364281
## iter 30 value 28.887317
## iter 40 value 28.153950
## iter 50 value 28.145835
## iter 60 value 28.145565
## final value 28.145496
## converged
## # weights: 206
## initial value 76.448638
## iter 10 value 36.245376
## iter 20 value 29.422675
## iter 30 value 28.971459
## iter 40 value 28.200392
## iter 50 value 27.975481
## iter 60 value 27.955356
## iter 70 value 27.949378
## iter 80 value 27.947282
## final value 27.947256
## converged
## # weights: 42
## initial value 39.579312
## iter 10 value 25.728578
## iter 20 value 20.797249
## iter 30 value 16.223903
## iter 40 value 15.871700
## iter 50 value 13.873213
## iter 60 value 10.919115
## iter 70 value 9.764265
## iter 80 value 9.754678
## iter 90 value 9.752892
## iter 100 value 9.752708
## final value 9.752708
## stopped after 100 iterations
## # weights: 83
## initial value 39.264501
## iter 10 value 26.315236
## iter 20 value 19.062494
## iter 30 value 18.815772
## iter 40 value 18.814083
## final value 18.814056
## converged
## # weights: 124
## initial value 40.700436
## iter 10 value 18.236900
## iter 20 value 9.173589
## iter 30 value 8.959864
## iter 40 value 8.909185
## iter 50 value 8.907125
## iter 60 value 8.906041
## iter 70 value 8.905927
## iter 80 value 8.905240
## iter 90 value 8.851523
## iter 100 value 8.762926
## final value 8.762926
## stopped after 100 iterations
## # weights: 165
## initial value 40.356085
## iter 10 value 25.887552
## iter 20 value 10.215281
## iter 30 value 9.341010
## iter 40 value 8.065364
## iter 50 value 4.176078
## iter 60 value 2.814463
## iter 70 value 0.113483
## iter 80 value 0.037607
## iter 90 value 0.006605
## iter 100 value 0.002313
## final value 0.002313
## stopped after 100 iterations
## # weights: 206
## initial value 38.239559
## iter 10 value 19.813053
## iter 20 value 8.539581
## iter 30 value 0.488021
## iter 40 value 0.005855
## final value 0.000068
## converged
## # weights: 42
## initial value 45.361364
## iter 10 value 37.176956
## iter 20 value 35.403506
## iter 30 value 32.639430
## iter 40 value 32.618884
## final value 32.618861
## converged
## # weights: 83
## initial value 53.956602
## iter 10 value 35.413312
## iter 20 value 30.685068
## iter 30 value 30.145695
## final value 30.145006
## converged
## # weights: 124
## initial value 54.324599
## iter 10 value 37.124107
## iter 20 value 31.495950
## iter 30 value 29.756525
## iter 40 value 29.654111
## iter 50 value 29.649305
## final value 29.648723
## converged
## # weights: 165
## initial value 69.968800
## iter 10 value 36.451110
## iter 20 value 30.937750
## iter 30 value 29.618580
## iter 40 value 29.082817
## iter 50 value 29.063351
## iter 60 value 29.057411
## iter 70 value 29.056566
## final value 29.056553
## converged
## # weights: 206
## initial value 77.828002
## iter 10 value 36.669934
## iter 20 value 31.609428
## iter 30 value 29.233791
## iter 40 value 28.915612
## iter 50 value 28.858228
## iter 60 value 28.847193
## iter 70 value 28.840738
## iter 80 value 28.839919
## final value 28.839898
## converged
## # weights: 42
## initial value 37.145482
## iter 10 value 21.377848
## iter 20 value 13.923663
## iter 30 value 11.360677
## iter 40 value 11.342557
## iter 50 value 11.334197
## iter 60 value 11.329747
## iter 70 value 11.290623
## iter 80 value 8.573542
## iter 90 value 8.483375
## iter 100 value 8.478289
## final value 8.478289
## stopped after 100 iterations
## # weights: 83
## initial value 35.791277
## iter 10 value 18.463347
## iter 20 value 17.586643
## iter 30 value 17.585594
## iter 40 value 17.585199
## iter 50 value 17.584804
## final value 17.584799
## converged
## # weights: 124
## initial value 40.551560
## iter 10 value 26.117834
## iter 20 value 22.104230
## iter 30 value 22.075337
## iter 40 value 22.074941
## iter 50 value 21.554845
## iter 60 value 15.534701
## iter 70 value 12.126068
## iter 80 value 12.109831
## iter 90 value 12.108911
## iter 100 value 12.108873
## final value 12.108873
## stopped after 100 iterations
## # weights: 165
## initial value 36.533399
## iter 10 value 22.153829
## iter 20 value 9.480934
## iter 30 value 2.065191
## iter 40 value 1.911985
## iter 50 value 1.909798
## iter 60 value 1.908344
## iter 70 value 1.877561
## iter 80 value 1.396355
## iter 90 value 1.386383
## iter 100 value 1.386337
## final value 1.386337
## stopped after 100 iterations
## # weights: 206
## initial value 39.549691
## iter 10 value 11.099303
## iter 20 value 4.188443
## iter 30 value 1.538773
## iter 40 value 0.069100
## iter 50 value 0.003998
## iter 60 value 0.000697
## iter 70 value 0.000285
## iter 80 value 0.000102
## iter 80 value 0.000098
## iter 80 value 0.000097
## final value 0.000097
## converged
## # weights: 42
## initial value 46.978003
## iter 10 value 32.526409
## iter 20 value 31.509557
## iter 30 value 31.508021
## iter 30 value 31.508021
## iter 30 value 31.508021
## final value 31.508021
## converged
## # weights: 83
## initial value 52.315328
## iter 10 value 33.920356
## iter 20 value 30.066642
## iter 30 value 29.337973
## iter 40 value 29.333758
## final value 29.333757
## converged
## # weights: 124
## initial value 67.674335
## iter 10 value 32.237658
## iter 20 value 28.984203
## iter 30 value 28.830153
## iter 40 value 28.822539
## final value 28.822396
## converged
## # weights: 165
## initial value 68.606108
## iter 10 value 34.225190
## iter 20 value 29.546285
## iter 30 value 28.472313
## iter 40 value 28.234751
## iter 50 value 28.229553
## final value 28.229485
## converged
## # weights: 206
## initial value 72.350117
## iter 10 value 34.544980
## iter 20 value 29.035050
## iter 30 value 28.193454
## iter 40 value 28.040294
## iter 50 value 28.031525
## iter 60 value 28.030020
## iter 70 value 28.029765
## final value 28.029764
## converged
## # weights: 42
## initial value 38.983841
## iter 10 value 36.945043
## iter 20 value 17.846729
## iter 30 value 13.908689
## iter 40 value 13.848704
## iter 50 value 13.844514
## iter 60 value 13.843839
## final value 13.843836
## converged
## # weights: 83
## initial value 40.599302
## iter 10 value 26.510421
## iter 20 value 24.077895
## iter 30 value 24.072512
## iter 40 value 22.865715
## iter 50 value 22.863960
## final value 22.863958
## converged
## # weights: 124
## initial value 40.091939
## iter 10 value 15.076666
## iter 20 value 3.885296
## iter 30 value 0.080618
## iter 40 value 0.003721
## iter 50 value 0.000743
## iter 60 value 0.000383
## final value 0.000098
## converged
## # weights: 165
## initial value 36.524116
## iter 10 value 12.801265
## iter 20 value 9.754990
## iter 30 value 9.752600
## iter 40 value 9.752516
## final value 9.752502
## converged
## # weights: 206
## initial value 44.059291
## iter 10 value 26.028730
## iter 20 value 10.028902
## iter 30 value 7.359834
## iter 40 value 7.055501
## iter 50 value 4.317877
## iter 60 value 4.315698
## iter 70 value 4.315376
## iter 80 value 4.314719
## iter 90 value 4.314596
## iter 100 value 4.314551
## final value 4.314551
## stopped after 100 iterations
## # weights: 42
## initial value 44.829694
## iter 10 value 37.164669
## iter 20 value 32.315274
## iter 30 value 31.597311
## final value 31.597195
## converged
## # weights: 83
## initial value 53.846544
## iter 10 value 36.708632
## iter 20 value 31.362736
## iter 30 value 30.697641
## iter 40 value 30.656172
## final value 30.656138
## converged
## # weights: 124
## initial value 58.014676
## iter 10 value 33.391484
## iter 20 value 30.639536
## iter 30 value 28.465306
## iter 40 value 28.220779
## iter 50 value 28.181563
## iter 60 value 28.180940
## final value 28.180938
## converged
## # weights: 165
## initial value 72.087055
## iter 10 value 31.654461
## iter 20 value 28.215493
## iter 30 value 27.941250
## iter 40 value 27.937605
## final value 27.937505
## converged
## # weights: 206
## initial value 71.085007
## iter 10 value 35.183898
## iter 20 value 30.340498
## iter 30 value 29.260276
## iter 40 value 27.512303
## iter 50 value 27.282594
## iter 60 value 27.261252
## iter 70 value 27.252063
## iter 80 value 27.249295
## final value 27.249227
## converged
## # weights: 42
## initial value 37.263985
## iter 10 value 22.037025
## iter 20 value 13.926130
## iter 30 value 13.868823
## final value 13.868764
## converged
## # weights: 83
## initial value 41.939748
## iter 10 value 22.431472
## iter 20 value 13.754009
## iter 30 value 11.483125
## iter 40 value 11.458611
## iter 50 value 11.447767
## iter 60 value 11.444439
## iter 70 value 11.443995
## iter 80 value 11.437038
## iter 90 value 8.574862
## iter 100 value 8.555570
## final value 8.555570
## stopped after 100 iterations
## # weights: 124
## initial value 38.938475
## iter 10 value 31.394824
## iter 20 value 22.874376
## iter 30 value 22.863896
## final value 22.863843
## converged
## # weights: 165
## initial value 38.613796
## iter 10 value 16.154408
## iter 20 value 7.209007
## iter 30 value 6.138269
## iter 40 value 6.109694
## iter 50 value 6.108725
## final value 6.108643
## converged
## # weights: 206
## initial value 38.764107
## iter 10 value 37.206892
## iter 20 value 26.683751
## iter 30 value 13.628183
## iter 40 value 4.789966
## iter 50 value 2.894662
## iter 60 value 2.708336
## iter 70 value 2.704055
## iter 80 value 2.466933
## iter 90 value 0.022585
## iter 100 value 0.010032
## final value 0.010032
## stopped after 100 iterations
## # weights: 42
## initial value 42.679442
## iter 10 value 36.491583
## iter 20 value 32.969804
## iter 30 value 32.953521
## final value 32.953507
## converged
## # weights: 83
## initial value 51.776707
## iter 10 value 33.758662
## iter 20 value 31.793297
## iter 30 value 30.589350
## iter 40 value 30.581061
## final value 30.581041
## converged
## # weights: 124
## initial value 59.260609
## iter 10 value 37.263219
## iter 20 value 32.358518
## iter 30 value 30.269878
## iter 40 value 30.074407
## iter 50 value 30.073594
## iter 50 value 30.073594
## iter 50 value 30.073594
## final value 30.073594
## converged
## # weights: 165
## initial value 64.241710
## iter 10 value 34.207571
## iter 20 value 30.538604
## iter 30 value 30.178039
## iter 40 value 30.170508
## iter 50 value 29.902557
## iter 60 value 29.581875
## iter 70 value 29.581523
## final value 29.581522
## converged
## # weights: 206
## initial value 84.381514
## iter 10 value 36.024289
## iter 20 value 31.694510
## iter 30 value 30.215405
## iter 40 value 29.804800
## iter 50 value 29.489896
## iter 60 value 29.416283
## iter 70 value 29.408418
## iter 80 value 29.377199
## iter 90 value 29.376424
## final value 29.376424
## converged
## # weights: 42
## initial value 37.452108
## iter 10 value 36.727379
## final value 36.727366
## converged
## # weights: 83
## initial value 37.089823
## iter 10 value 13.531837
## iter 20 value 10.761579
## iter 30 value 8.247449
## iter 40 value 7.303411
## iter 50 value 7.280418
## iter 60 value 7.277195
## iter 70 value 1.038631
## iter 80 value 0.012276
## iter 90 value 0.005247
## iter 100 value 0.000212
## final value 0.000212
## stopped after 100 iterations
## # weights: 124
## initial value 40.586404
## iter 10 value 23.029633
## iter 20 value 20.306395
## iter 30 value 17.778686
## iter 40 value 16.524855
## iter 50 value 15.901260
## iter 60 value 15.897095
## iter 70 value 15.895263
## iter 80 value 15.894860
## iter 90 value 15.893714
## final value 15.893675
## converged
## # weights: 165
## initial value 39.170927
## iter 10 value 19.291981
## iter 20 value 11.927351
## iter 30 value 11.920808
## iter 40 value 11.896874
## iter 50 value 9.204165
## iter 60 value 0.204938
## iter 70 value 0.007805
## iter 80 value 0.001966
## final value 0.000059
## converged
## # weights: 206
## initial value 38.811497
## iter 10 value 20.919958
## iter 20 value 10.000992
## iter 30 value 6.171836
## iter 40 value 4.971991
## iter 50 value 1.130037
## iter 60 value 0.052622
## iter 70 value 0.002908
## iter 80 value 0.000346
## final value 0.000084
## converged
## # weights: 42
## initial value 46.073953
## iter 10 value 33.768493
## iter 20 value 32.076328
## iter 30 value 32.074976
## final value 32.074970
## converged
## # weights: 83
## initial value 52.801976
## iter 10 value 35.325993
## iter 20 value 32.244403
## iter 30 value 31.853815
## iter 40 value 29.796529
## iter 50 value 29.776038
## iter 60 value 29.775947
## iter 60 value 29.775947
## iter 60 value 29.775947
## final value 29.775947
## converged
## # weights: 124
## initial value 57.494961
## iter 10 value 34.430731
## iter 20 value 30.159546
## iter 30 value 29.256161
## iter 40 value 29.250137
## iter 50 value 29.249347
## final value 29.249344
## converged
## # weights: 165
## initial value 64.885477
## iter 10 value 31.607186
## iter 20 value 29.044896
## iter 30 value 28.800421
## iter 40 value 28.713559
## iter 50 value 28.713147
## final value 28.713135
## converged
## # weights: 206
## initial value 70.239027
## iter 10 value 35.735459
## iter 20 value 29.687238
## iter 30 value 28.871590
## iter 40 value 28.711902
## iter 50 value 28.541197
## iter 60 value 28.509971
## iter 70 value 28.505564
## iter 80 value 28.505190
## final value 28.505189
## converged
## # weights: 42
## initial value 37.541773
## iter 10 value 26.400635
## iter 20 value 25.141264
## iter 30 value 25.127353
## iter 40 value 22.541044
## iter 50 value 22.522353
## iter 60 value 22.520717
## iter 70 value 22.520576
## iter 80 value 22.520534
## final value 22.520533
## converged
## # weights: 83
## initial value 37.391155
## iter 10 value 22.351072
## iter 20 value 9.988901
## iter 30 value 9.521244
## iter 40 value 8.445055
## iter 50 value 8.425709
## iter 60 value 8.064217
## iter 70 value 7.685241
## iter 80 value 7.155245
## iter 90 value 7.013396
## iter 100 value 7.012708
## final value 7.012708
## stopped after 100 iterations
## # weights: 124
## initial value 41.516768
## iter 10 value 13.882734
## iter 20 value 10.952788
## iter 30 value 8.604887
## iter 40 value 4.898050
## iter 50 value 4.241550
## iter 60 value 3.200280
## iter 70 value 2.869080
## iter 80 value 2.505753
## iter 90 value 2.268951
## iter 100 value 2.251914
## final value 2.251914
## stopped after 100 iterations
## # weights: 165
## initial value 38.033717
## iter 10 value 13.548790
## iter 20 value 4.327251
## iter 30 value 4.314282
## iter 40 value 4.314147
## final value 4.314132
## converged
## # weights: 206
## initial value 45.709612
## iter 10 value 15.650864
## iter 20 value 4.461805
## iter 30 value 4.317804
## iter 40 value 4.314156
## final value 4.314131
## converged
## # weights: 42
## initial value 48.631896
## iter 10 value 32.527159
## iter 20 value 31.633767
## final value 31.633151
## converged
## # weights: 83
## initial value 52.475052
## iter 10 value 35.157291
## iter 20 value 31.222149
## iter 30 value 29.143014
## iter 40 value 28.861232
## final value 28.861136
## converged
## # weights: 124
## initial value 62.963033
## iter 10 value 36.842213
## iter 20 value 29.941479
## iter 30 value 28.359467
## iter 40 value 28.263168
## iter 50 value 28.258782
## iter 60 value 28.258498
## final value 28.258496
## converged
## # weights: 165
## initial value 73.162517
## iter 10 value 33.074844
## iter 20 value 28.562802
## iter 30 value 27.726274
## iter 40 value 27.633304
## iter 50 value 27.630075
## final value 27.629918
## converged
## # weights: 206
## initial value 75.727416
## iter 10 value 35.714576
## iter 20 value 30.156688
## iter 30 value 28.468872
## iter 40 value 27.908545
## iter 50 value 27.465390
## iter 60 value 27.419375
## iter 70 value 27.405363
## iter 80 value 27.401569
## iter 90 value 27.401427
## final value 27.401425
## converged
## # weights: 42
## initial value 36.802904
## iter 10 value 36.724377
## iter 20 value 31.449425
## iter 30 value 23.571643
## iter 40 value 15.723925
## iter 50 value 15.647282
## iter 60 value 15.646654
## final value 15.646623
## converged
## # weights: 83
## initial value 38.463306
## iter 10 value 35.848627
## iter 20 value 24.139160
## iter 30 value 16.058381
## iter 40 value 9.385229
## iter 50 value 8.515019
## iter 60 value 8.485930
## iter 70 value 8.477116
## iter 80 value 8.476360
## iter 90 value 8.476063
## iter 100 value 8.475919
## final value 8.475919
## stopped after 100 iterations
## # weights: 124
## initial value 36.862867
## iter 10 value 24.102002
## iter 20 value 14.788449
## iter 30 value 7.908615
## iter 40 value 2.462946
## iter 50 value 1.482587
## iter 60 value 1.402463
## iter 70 value 1.390389
## iter 80 value 0.075347
## iter 90 value 0.019340
## iter 100 value 0.003484
## final value 0.003484
## stopped after 100 iterations
## # weights: 165
## initial value 36.762600
## iter 10 value 12.960179
## iter 20 value 0.734084
## iter 30 value 0.004818
## iter 40 value 0.000171
## final value 0.000028
## converged
## # weights: 206
## initial value 38.056673
## iter 10 value 17.217565
## iter 20 value 12.578088
## iter 30 value 11.819391
## iter 40 value 9.769091
## iter 50 value 9.756511
## iter 60 value 9.754197
## iter 70 value 7.364423
## iter 80 value 5.689687
## iter 90 value 4.318302
## iter 100 value 4.315607
## final value 4.315607
## stopped after 100 iterations
## # weights: 42
## initial value 44.240704
## iter 10 value 34.739134
## iter 20 value 32.274097
## iter 30 value 32.266622
## final value 32.266620
## converged
## # weights: 83
## initial value 49.483770
## iter 10 value 36.267661
## iter 20 value 31.362202
## iter 30 value 29.989687
## iter 40 value 29.844273
## final value 29.844139
## converged
## # weights: 124
## initial value 58.342469
## iter 10 value 35.906190
## iter 20 value 30.940039
## iter 30 value 29.624763
## iter 40 value 29.297640
## iter 50 value 29.294130
## iter 60 value 29.293907
## final value 29.293897
## converged
## # weights: 165
## initial value 65.922376
## iter 10 value 34.063292
## iter 20 value 29.995251
## iter 30 value 29.270770
## iter 40 value 28.775425
## iter 50 value 28.744018
## iter 60 value 28.743467
## iter 70 value 28.743405
## final value 28.743405
## converged
## # weights: 206
## initial value 70.258019
## iter 10 value 33.613285
## iter 20 value 29.340304
## iter 30 value 28.755190
## iter 40 value 28.709487
## iter 50 value 28.558564
## iter 60 value 28.549334
## iter 70 value 28.549282
## final value 28.549281
## converged
## # weights: 42
## initial value 38.873502
## iter 10 value 27.463174
## iter 20 value 25.496490
## iter 30 value 22.359805
## iter 40 value 22.043627
## iter 50 value 20.364376
## iter 60 value 17.122479
## iter 70 value 16.602842
## iter 80 value 16.592783
## iter 90 value 16.588267
## iter 100 value 16.586596
## final value 16.586596
## stopped after 100 iterations
## # weights: 83
## initial value 38.028342
## iter 10 value 21.272785
## iter 20 value 16.612603
## iter 30 value 14.899140
## iter 40 value 13.850710
## iter 50 value 13.847035
## iter 60 value 13.845480
## iter 70 value 13.844908
## iter 80 value 13.844190
## iter 90 value 13.844088
## iter 100 value 13.843964
## final value 13.843964
## stopped after 100 iterations
## # weights: 124
## initial value 41.427311
## iter 10 value 17.505370
## iter 20 value 11.810086
## iter 30 value 9.838674
## iter 40 value 9.822190
## iter 50 value 9.820558
## iter 60 value 9.819934
## final value 9.819908
## converged
## # weights: 165
## initial value 38.377271
## iter 10 value 25.540316
## iter 20 value 21.535290
## iter 30 value 9.879150
## iter 40 value 4.443147
## iter 50 value 3.864126
## iter 60 value 3.826843
## iter 70 value 3.576864
## iter 80 value 1.951754
## iter 90 value 1.921169
## iter 100 value 1.649334
## final value 1.649334
## stopped after 100 iterations
## # weights: 206
## initial value 40.937530
## iter 10 value 24.758108
## iter 20 value 16.530500
## iter 30 value 14.619314
## iter 40 value 13.432714
## iter 50 value 13.412821
## iter 60 value 13.406982
## iter 70 value 13.292743
## iter 80 value 13.287146
## iter 90 value 13.287051
## iter 100 value 13.286180
## final value 13.286180
## stopped after 100 iterations
## # weights: 42
## initial value 46.893364
## iter 10 value 37.235405
## iter 20 value 33.772756
## iter 30 value 33.593184
## final value 33.593076
## converged
## # weights: 83
## initial value 50.076399
## iter 10 value 35.004745
## iter 20 value 32.027528
## iter 30 value 31.857588
## iter 40 value 31.673598
## iter 50 value 31.670622
## final value 31.670603
## converged
## # weights: 124
## initial value 64.144959
## iter 10 value 35.992016
## iter 20 value 31.733684
## iter 30 value 31.245672
## iter 40 value 31.229032
## iter 50 value 31.226533
## iter 60 value 31.226044
## iter 60 value 31.226044
## iter 60 value 31.226044
## final value 31.226044
## converged
## # weights: 165
## initial value 67.277161
## iter 10 value 34.681115
## iter 20 value 31.738505
## iter 30 value 31.188559
## iter 40 value 31.122858
## iter 50 value 31.091790
## iter 60 value 31.091600
## final value 31.091502
## converged
## # weights: 206
## initial value 71.726023
## iter 10 value 36.297715
## iter 20 value 32.621907
## iter 30 value 30.881638
## iter 40 value 30.595635
## iter 50 value 30.557812
## iter 60 value 30.551557
## iter 70 value 30.551150
## final value 30.551128
## converged
## # weights: 42
## initial value 38.761438
## final value 36.727366
## converged
## # weights: 83
## initial value 37.104068
## iter 10 value 22.594903
## iter 20 value 11.389778
## iter 30 value 11.211085
## iter 40 value 11.207880
## final value 11.207849
## converged
## # weights: 124
## initial value 37.679868
## iter 10 value 22.544810
## iter 20 value 7.703849
## iter 30 value 1.568541
## iter 40 value 0.039338
## iter 50 value 0.000391
## final value 0.000062
## converged
## # weights: 165
## initial value 37.314057
## iter 10 value 14.393939
## iter 20 value 11.075259
## iter 30 value 8.577798
## iter 40 value 8.089404
## iter 50 value 8.066762
## iter 60 value 8.058540
## iter 70 value 7.609696
## iter 80 value 7.604990
## iter 90 value 7.604267
## iter 100 value 7.604159
## final value 7.604159
## stopped after 100 iterations
## # weights: 206
## initial value 38.366015
## iter 10 value 13.549450
## iter 20 value 4.258180
## iter 30 value 2.055016
## iter 40 value 1.929145
## iter 50 value 1.912337
## iter 60 value 1.911542
## iter 70 value 1.909752
## iter 80 value 0.269463
## iter 90 value 0.004006
## iter 100 value 0.000392
## final value 0.000392
## stopped after 100 iterations
## # weights: 42
## initial value 42.532564
## iter 10 value 31.737216
## iter 20 value 31.510626
## final value 31.510570
## converged
## # weights: 83
## initial value 50.030179
## iter 10 value 33.072089
## iter 20 value 28.982620
## iter 30 value 28.874714
## iter 40 value 28.873172
## final value 28.873171
## converged
## # weights: 124
## initial value 58.128842
## iter 10 value 32.990424
## iter 20 value 29.419985
## iter 30 value 28.852844
## iter 40 value 28.360847
## iter 50 value 28.276144
## iter 60 value 28.275304
## final value 28.275300
## converged
## # weights: 165
## initial value 65.179239
## iter 10 value 36.475489
## iter 20 value 30.015834
## iter 30 value 28.319212
## iter 40 value 27.766275
## iter 50 value 27.673135
## iter 60 value 27.672377
## iter 60 value 27.672377
## iter 60 value 27.672377
## final value 27.672377
## converged
## # weights: 206
## initial value 66.436741
## iter 10 value 37.164963
## iter 20 value 29.220663
## iter 30 value 27.831162
## iter 40 value 27.657073
## iter 50 value 27.456328
## iter 60 value 27.444083
## iter 70 value 27.441190
## iter 80 value 27.440661
## final value 27.440652
## converged
## # weights: 42
## initial value 38.020564
## iter 10 value 21.169013
## iter 20 value 11.902843
## iter 30 value 9.660604
## iter 40 value 9.649831
## iter 50 value 9.647173
## iter 60 value 9.645869
## iter 70 value 9.645364
## final value 9.645304
## converged
## # weights: 83
## initial value 40.865443
## iter 10 value 24.451119
## iter 20 value 7.701496
## iter 30 value 7.564427
## iter 40 value 7.563745
## iter 50 value 7.563664
## final value 7.563662
## converged
## # weights: 124
## initial value 39.793548
## iter 10 value 24.008245
## iter 20 value 11.701516
## iter 30 value 11.421202
## iter 40 value 11.407686
## iter 50 value 11.403961
## iter 60 value 11.403738
## iter 70 value 11.403705
## iter 80 value 11.403527
## final value 11.403526
## converged
## # weights: 165
## initial value 38.076318
## iter 10 value 22.377603
## iter 20 value 3.953223
## iter 30 value 0.039332
## iter 40 value 0.001438
## final value 0.000049
## converged
## # weights: 206
## initial value 41.456225
## iter 10 value 17.738450
## iter 20 value 4.442076
## iter 30 value 3.953717
## iter 40 value 3.764614
## iter 50 value 3.640465
## iter 60 value 3.633976
## iter 70 value 1.456619
## iter 80 value 1.407404
## iter 90 value 1.390347
## iter 100 value 1.386809
## final value 1.386809
## stopped after 100 iterations
## # weights: 42
## initial value 42.903201
## iter 10 value 36.084512
## iter 20 value 31.776249
## iter 30 value 31.728425
## final value 31.728253
## converged
## # weights: 83
## initial value 49.933407
## iter 10 value 34.858093
## iter 20 value 31.595276
## iter 30 value 29.684070
## iter 40 value 29.442436
## iter 50 value 29.441678
## final value 29.441674
## converged
## # weights: 124
## initial value 59.243299
## iter 10 value 33.343694
## iter 20 value 29.292536
## iter 30 value 28.969588
## iter 40 value 28.892957
## iter 50 value 28.890646
## iter 60 value 28.890512
## final value 28.890511
## converged
## # weights: 165
## initial value 65.436414
## iter 10 value 33.968407
## iter 20 value 29.725012
## iter 30 value 28.718684
## iter 40 value 28.366731
## iter 50 value 28.348157
## iter 60 value 28.347200
## final value 28.347199
## converged
## # weights: 206
## initial value 74.187729
## iter 10 value 33.723062
## iter 20 value 28.622307
## iter 30 value 28.335937
## iter 40 value 28.162297
## iter 50 value 28.154444
## iter 60 value 28.153724
## final value 28.153721
## converged
## # weights: 83
## initial value 87.504295
## iter 10 value 55.605142
## iter 20 value 50.715555
## iter 30 value 42.439019
## iter 40 value 42.146910
## iter 50 value 41.654480
## iter 60 value 41.424125
## iter 70 value 41.423215
## final value 41.423095
## converged
tictoc::toc()
## 4.85 sec elapsed
# Train control with random search: search = "random" samples hyperparameter
# combinations at random instead of evaluating an exhaustive grid, so
# tuneLength below means "number of random candidates", not grid resolution.
fitControl <- caret::trainControl(method = "repeatedcv", number = 3, repeats = 5, search = "random")
# Test 6 random hyperparameter combinations
tictoc::tic()
# FIX: nnet ignores `verbose`; its per-iteration optimizer log is controlled
# by `trace`. Adding trace = FALSE suppresses the "# weights ... / iter ..."
# output that otherwise floods the knitted document (see the echoed output
# above/below). `verbose = FALSE` is retained for compatibility — caret
# forwards it harmlessly through `...`.
nn_model_voters_big_grid <- caret::train(turnout16_2016 ~ ., data = voters_train_data, method = "nnet",
  trControl = fitControl, verbose = FALSE, trace = FALSE, tuneLength = 6
)
## # weights: 780
## initial value 42.095991
## iter 10 value 11.477256
## iter 20 value 0.192048
## iter 30 value 0.139685
## iter 40 value 0.115191
## iter 50 value 0.094285
## iter 60 value 0.078501
## iter 70 value 0.072942
## iter 80 value 0.068058
## iter 90 value 0.065069
## iter 100 value 0.060509
## final value 0.060509
## stopped after 100 iterations
## # weights: 42
## initial value 59.042295
## iter 10 value 37.432959
## iter 20 value 37.420418
## final value 37.420383
## converged
## # weights: 452
## initial value 553.839156
## iter 10 value 56.555697
## iter 20 value 37.948157
## iter 30 value 37.459174
## iter 40 value 37.433606
## iter 50 value 37.430055
## iter 60 value 37.429948
## iter 60 value 37.429948
## iter 60 value 37.429948
## final value 37.429948
## converged
## # weights: 83
## initial value 78.146350
## iter 10 value 37.402337
## iter 20 value 37.302006
## iter 30 value 37.301722
## final value 37.301721
## converged
## # weights: 165
## initial value 37.604154
## iter 10 value 15.979035
## iter 20 value 7.272046
## iter 30 value 3.830592
## iter 40 value 1.933663
## iter 50 value 0.387161
## iter 60 value 0.341547
## iter 70 value 0.284280
## iter 80 value 0.249284
## iter 90 value 0.200176
## iter 100 value 0.163334
## final value 0.163334
## stopped after 100 iterations
## # weights: 616
## initial value 44.262407
## iter 10 value 18.091032
## iter 20 value 7.992399
## iter 30 value 3.862796
## iter 40 value 2.793425
## iter 50 value 2.491309
## iter 60 value 2.376890
## iter 70 value 2.306483
## iter 80 value 2.279268
## iter 90 value 2.238432
## iter 100 value 2.216990
## final value 2.216990
## stopped after 100 iterations
## # weights: 780
## initial value 37.722928
## iter 10 value 15.819151
## iter 20 value 1.354592
## iter 30 value 0.210547
## iter 40 value 0.144478
## iter 50 value 0.134844
## iter 60 value 0.122631
## iter 70 value 0.101942
## iter 80 value 0.091843
## iter 90 value 0.081159
## iter 100 value 0.072211
## final value 0.072211
## stopped after 100 iterations
## # weights: 42
## initial value 61.134041
## iter 10 value 35.945541
## iter 20 value 35.812616
## final value 35.812590
## converged
## # weights: 452
## initial value 585.216109
## iter 10 value 38.436536
## iter 20 value 36.137038
## iter 30 value 36.047457
## iter 40 value 36.043713
## final value 36.043653
## converged
## # weights: 83
## initial value 79.438600
## iter 10 value 35.704312
## iter 20 value 35.354481
## iter 30 value 35.352628
## final value 35.352627
## converged
## # weights: 165
## initial value 36.013564
## iter 10 value 17.015511
## iter 20 value 5.273321
## iter 30 value 2.992541
## iter 40 value 2.951417
## iter 50 value 2.819570
## iter 60 value 0.724280
## iter 70 value 0.270301
## iter 80 value 0.231933
## iter 90 value 0.203429
## iter 100 value 0.163522
## final value 0.163522
## stopped after 100 iterations
## # weights: 616
## initial value 47.992166
## iter 10 value 16.954806
## iter 20 value 6.644326
## iter 30 value 3.594294
## iter 40 value 2.691986
## iter 50 value 2.467660
## iter 60 value 2.333128
## iter 70 value 2.257916
## iter 80 value 2.215114
## iter 90 value 2.193699
## iter 100 value 2.185256
## final value 2.185256
## stopped after 100 iterations
## # weights: 780
## initial value 39.163505
## iter 10 value 10.052538
## iter 20 value 0.224281
## iter 30 value 0.134141
## iter 40 value 0.103848
## iter 50 value 0.081907
## iter 60 value 0.070483
## iter 70 value 0.062302
## iter 80 value 0.056113
## iter 90 value 0.050826
## iter 100 value 0.047241
## final value 0.047241
## stopped after 100 iterations
## # weights: 42
## initial value 63.990222
## iter 10 value 37.426187
## iter 20 value 37.303877
## final value 37.303437
## converged
## # weights: 452
## initial value 575.470315
## iter 10 value 43.960207
## iter 20 value 37.619610
## iter 30 value 37.433908
## iter 40 value 37.430696
## iter 50 value 37.429952
## final value 37.429948
## converged
## # weights: 83
## initial value 83.195204
## iter 10 value 37.901089
## iter 20 value 37.044286
## iter 30 value 36.985201
## final value 36.985143
## converged
## # weights: 165
## initial value 39.061368
## iter 10 value 22.181704
## iter 20 value 10.718664
## iter 30 value 5.223358
## iter 40 value 2.843380
## iter 50 value 2.548575
## iter 60 value 2.513813
## iter 70 value 2.468511
## iter 80 value 2.162969
## iter 90 value 1.713318
## iter 100 value 0.476445
## final value 0.476445
## stopped after 100 iterations
## # weights: 616
## initial value 39.310513
## iter 10 value 21.235338
## iter 20 value 7.409032
## iter 30 value 3.707935
## iter 40 value 2.933656
## iter 50 value 2.589250
## iter 60 value 2.434121
## iter 70 value 2.307593
## iter 80 value 2.245299
## iter 90 value 2.215206
## iter 100 value 2.195757
## final value 2.195757
## stopped after 100 iterations
## # weights: 780
## initial value 45.390625
## iter 10 value 1.260303
## iter 20 value 0.111025
## iter 30 value 0.096680
## iter 40 value 0.085225
## iter 50 value 0.067893
## iter 60 value 0.056329
## iter 70 value 0.050509
## iter 80 value 0.045782
## iter 90 value 0.042122
## iter 100 value 0.039743
## final value 0.039743
## stopped after 100 iterations
## # weights: 42
## initial value 63.371226
## iter 10 value 36.791974
## iter 20 value 36.691666
## iter 30 value 36.641830
## final value 36.641786
## converged
## # weights: 452
## initial value 602.935601
## iter 10 value 39.338365
## iter 20 value 36.775330
## iter 30 value 36.729462
## iter 40 value 36.729128
## final value 36.729122
## converged
## # weights: 83
## initial value 102.375607
## iter 10 value 36.798850
## iter 20 value 36.429710
## iter 30 value 36.427117
## final value 36.427116
## converged
## # weights: 165
## initial value 36.625031
## iter 10 value 14.846930
## iter 20 value 1.990558
## iter 30 value 0.333866
## iter 40 value 0.294679
## iter 50 value 0.262435
## iter 60 value 0.231028
## iter 70 value 0.207296
## iter 80 value 0.186618
## iter 90 value 0.161398
## iter 100 value 0.124395
## final value 0.124395
## stopped after 100 iterations
## # weights: 616
## initial value 45.637851
## iter 10 value 12.687417
## iter 20 value 4.155737
## iter 30 value 2.378098
## iter 40 value 2.013474
## iter 50 value 1.878404
## iter 60 value 1.812370
## iter 70 value 1.780197
## iter 80 value 1.770715
## iter 90 value 1.764248
## iter 100 value 1.758322
## final value 1.758322
## stopped after 100 iterations
## # weights: 780
## initial value 42.819396
## iter 10 value 12.987554
## iter 20 value 0.417699
## iter 30 value 0.268359
## iter 40 value 0.163308
## iter 50 value 0.121675
## iter 60 value 0.102754
## iter 70 value 0.094386
## iter 80 value 0.089171
## iter 90 value 0.081589
## iter 100 value 0.077903
## final value 0.077903
## stopped after 100 iterations
## # weights: 42
## initial value 59.457067
## iter 10 value 36.719317
## iter 20 value 36.512862
## final value 36.512485
## converged
## # weights: 452
## initial value 641.178198
## iter 10 value 38.317632
## iter 20 value 36.843606
## iter 30 value 36.735746
## iter 40 value 36.729089
## iter 50 value 36.728877
## final value 36.728877
## converged
## # weights: 83
## initial value 74.352054
## iter 10 value 36.462081
## iter 20 value 36.160669
## final value 36.159809
## converged
## # weights: 165
## initial value 38.662104
## iter 10 value 12.629428
## iter 20 value 6.705331
## iter 30 value 5.860436
## iter 40 value 4.651619
## iter 50 value 4.537701
## iter 60 value 2.872848
## iter 70 value 0.706095
## iter 80 value 0.453485
## iter 90 value 0.364220
## iter 100 value 0.314568
## final value 0.314568
## stopped after 100 iterations
## # weights: 616
## initial value 40.009335
## iter 10 value 14.087456
## iter 20 value 5.617005
## iter 30 value 3.152658
## iter 40 value 2.513647
## iter 50 value 2.353026
## iter 60 value 2.306978
## iter 70 value 2.266536
## iter 80 value 2.239816
## iter 90 value 2.228624
## iter 100 value 2.218696
## final value 2.218696
## stopped after 100 iterations
## # weights: 780
## initial value 39.005315
## iter 10 value 14.769652
## iter 20 value 1.739485
## iter 30 value 0.238076
## iter 40 value 0.186497
## iter 50 value 0.150203
## iter 60 value 0.113830
## iter 70 value 0.096412
## iter 80 value 0.083231
## iter 90 value 0.074761
## iter 100 value 0.069245
## final value 0.069245
## stopped after 100 iterations
## # weights: 42
## initial value 55.825909
## iter 10 value 37.452869
## iter 20 value 37.332664
## iter 30 value 37.328823
## final value 37.328821
## converged
## # weights: 452
## initial value 573.254838
## iter 10 value 38.786598
## iter 20 value 37.459129
## iter 30 value 37.431030
## iter 40 value 37.429958
## final value 37.429948
## converged
## # weights: 83
## initial value 79.483393
## iter 10 value 37.477852
## iter 20 value 37.050537
## iter 30 value 37.049302
## final value 37.049265
## converged
## # weights: 165
## initial value 48.578017
## iter 10 value 29.702696
## iter 20 value 16.419372
## iter 30 value 14.992210
## iter 40 value 14.075778
## iter 50 value 13.281265
## iter 60 value 11.341777
## iter 70 value 10.877987
## iter 80 value 10.792720
## iter 90 value 10.374044
## iter 100 value 9.186177
## final value 9.186177
## stopped after 100 iterations
## # weights: 616
## initial value 62.705898
## iter 10 value 15.210731
## iter 20 value 6.657819
## iter 30 value 3.558933
## iter 40 value 2.588783
## iter 50 value 2.412344
## iter 60 value 2.339109
## iter 70 value 2.252255
## iter 80 value 2.203906
## iter 90 value 2.168884
## iter 100 value 2.149394
## final value 2.149394
## stopped after 100 iterations
## # weights: 780
## initial value 40.292120
## iter 10 value 14.246517
## iter 20 value 2.989623
## iter 30 value 0.240685
## iter 40 value 0.200952
## iter 50 value 0.134743
## iter 60 value 0.098611
## iter 70 value 0.078535
## iter 80 value 0.070926
## iter 90 value 0.062139
## iter 100 value 0.055763
## final value 0.055763
## stopped after 100 iterations
## # weights: 42
## initial value 59.897852
## iter 10 value 36.814750
## iter 20 value 36.681968
## iter 30 value 36.422000
## iter 40 value 36.421020
## iter 40 value 36.421020
## iter 40 value 36.421020
## final value 36.421020
## converged
## # weights: 452
## initial value 610.498144
## iter 10 value 48.291560
## iter 20 value 37.874822
## iter 30 value 36.769871
## iter 40 value 36.732825
## iter 50 value 36.729117
## iter 60 value 36.728756
## iter 60 value 36.728756
## iter 60 value 36.728756
## final value 36.728756
## converged
## # weights: 83
## initial value 79.112134
## iter 10 value 37.170455
## iter 20 value 35.844240
## iter 30 value 35.771595
## final value 35.771550
## converged
## # weights: 165
## initial value 48.479803
## iter 10 value 26.887564
## iter 20 value 14.959194
## iter 30 value 11.956342
## iter 40 value 11.930216
## iter 50 value 11.897640
## iter 60 value 11.879147
## iter 70 value 11.739201
## iter 80 value 9.742511
## iter 90 value 9.725574
## iter 100 value 9.639212
## final value 9.639212
## stopped after 100 iterations
## # weights: 616
## initial value 39.624363
## iter 10 value 17.050045
## iter 20 value 4.846687
## iter 30 value 2.799602
## iter 40 value 2.309892
## iter 50 value 2.145687
## iter 60 value 2.062018
## iter 70 value 2.014823
## iter 80 value 1.989463
## iter 90 value 1.977966
## iter 100 value 1.972199
## final value 1.972199
## stopped after 100 iterations
## # weights: 780
## initial value 40.977390
## iter 10 value 9.840887
## iter 20 value 1.011521
## iter 30 value 0.246611
## iter 40 value 0.188889
## iter 50 value 0.138214
## iter 60 value 0.118413
## iter 70 value 0.099929
## iter 80 value 0.082123
## iter 90 value 0.072491
## iter 100 value 0.066546
## final value 0.066546
## stopped after 100 iterations
## # weights: 42
## initial value 58.763565
## iter 10 value 36.715493
## iter 20 value 36.709613
## final value 36.709611
## converged
## # weights: 452
## initial value 670.925860
## iter 10 value 64.326356
## iter 20 value 37.073699
## iter 30 value 36.738858
## iter 40 value 36.730563
## iter 50 value 36.729230
## final value 36.729224
## converged
## # weights: 83
## initial value 76.191688
## iter 10 value 36.708437
## iter 20 value 36.677110
## iter 30 value 36.635095
## final value 36.634998
## converged
## # weights: 165
## initial value 38.448315
## iter 10 value 36.746551
## iter 20 value 36.525825
## iter 30 value 16.480157
## iter 40 value 14.116140
## iter 50 value 12.029902
## iter 60 value 11.884332
## iter 70 value 11.874378
## iter 80 value 11.866986
## iter 90 value 11.629927
## iter 100 value 2.017779
## final value 2.017779
## stopped after 100 iterations
## # weights: 616
## initial value 40.134269
## iter 10 value 13.293328
## iter 20 value 4.957187
## iter 30 value 3.095058
## iter 40 value 2.538831
## iter 50 value 2.379299
## iter 60 value 2.303894
## iter 70 value 2.259849
## iter 80 value 2.232007
## iter 90 value 2.218373
## iter 100 value 2.206641
## final value 2.206641
## stopped after 100 iterations
## # weights: 780
## initial value 50.393083
## iter 10 value 10.808405
## iter 20 value 3.142951
## iter 30 value 0.978622
## iter 40 value 0.205693
## iter 50 value 0.187392
## iter 60 value 0.165499
## iter 70 value 0.136085
## iter 80 value 0.120482
## iter 90 value 0.102364
## iter 100 value 0.090728
## final value 0.090728
## stopped after 100 iterations
## # weights: 42
## initial value 56.129598
## iter 10 value 37.429995
## iter 20 value 37.341543
## final value 37.340761
## converged
## # weights: 452
## initial value 580.494720
## iter 10 value 41.243636
## iter 20 value 37.529179
## iter 30 value 37.435238
## iter 40 value 37.430108
## final value 37.429948
## converged
## # weights: 83
## initial value 76.715138
## iter 10 value 37.414601
## iter 20 value 37.209138
## iter 30 value 37.093629
## final value 37.093595
## converged
## # weights: 165
## initial value 36.739867
## iter 10 value 18.895476
## iter 20 value 2.836283
## iter 30 value 2.433627
## iter 40 value 0.599902
## iter 50 value 0.412432
## iter 60 value 0.373373
## iter 70 value 0.310848
## iter 80 value 0.236368
## iter 90 value 0.207126
## iter 100 value 0.194429
## final value 0.194429
## stopped after 100 iterations
## # weights: 616
## initial value 39.182652
## iter 10 value 13.391480
## iter 20 value 5.361618
## iter 30 value 2.914686
## iter 40 value 2.591341
## iter 50 value 2.460647
## iter 60 value 2.374053
## iter 70 value 2.332764
## iter 80 value 2.295253
## iter 90 value 2.278175
## iter 100 value 2.271961
## final value 2.271961
## stopped after 100 iterations
## # weights: 780
## initial value 37.461655
## iter 10 value 16.307209
## iter 20 value 0.470690
## iter 30 value 0.147915
## iter 40 value 0.121580
## iter 50 value 0.099273
## iter 60 value 0.086040
## iter 70 value 0.078263
## iter 80 value 0.071824
## iter 90 value 0.066887
## iter 100 value 0.061589
## final value 0.061589
## stopped after 100 iterations
## # weights: 42
## initial value 59.536250
## iter 10 value 36.766669
## iter 20 value 36.731188
## iter 30 value 36.716264
## final value 36.716235
## converged
## # weights: 452
## initial value 603.217414
## iter 10 value 50.097127
## iter 20 value 37.369234
## iter 30 value 36.741135
## iter 40 value 36.730330
## iter 50 value 36.729249
## final value 36.729222
## converged
## # weights: 83
## initial value 78.919273
## iter 10 value 36.785511
## iter 20 value 36.714335
## iter 30 value 36.674450
## iter 40 value 36.665749
## final value 36.665741
## converged
## # weights: 165
## initial value 45.265427
## iter 10 value 27.334581
## iter 20 value 17.481668
## iter 30 value 15.592648
## iter 40 value 11.155156
## iter 50 value 4.904583
## iter 60 value 3.921931
## iter 70 value 3.116526
## iter 80 value 3.028310
## iter 90 value 3.013188
## iter 100 value 2.972920
## final value 2.972920
## stopped after 100 iterations
## # weights: 616
## initial value 58.270668
## iter 10 value 22.796252
## iter 20 value 9.279852
## iter 30 value 5.206134
## iter 40 value 3.811898
## iter 50 value 3.244092
## iter 60 value 3.074384
## iter 70 value 3.004659
## iter 80 value 2.960947
## iter 90 value 2.922234
## iter 100 value 2.897032
## final value 2.897032
## stopped after 100 iterations
## # weights: 780
## initial value 41.376552
## iter 10 value 14.468082
## iter 20 value 0.351423
## iter 30 value 0.137733
## iter 40 value 0.112835
## iter 50 value 0.089230
## iter 60 value 0.075916
## iter 70 value 0.068009
## iter 80 value 0.060238
## iter 90 value 0.054936
## iter 100 value 0.049549
## final value 0.049549
## stopped after 100 iterations
## # weights: 42
## initial value 64.261380
## iter 10 value 36.415336
## iter 20 value 36.350419
## final value 36.350404
## converged
## # weights: 452
## initial value 575.285191
## iter 10 value 45.325764
## iter 20 value 38.035560
## iter 30 value 36.774493
## iter 40 value 36.733352
## iter 50 value 36.728795
## final value 36.728681
## converged
## # weights: 83
## initial value 75.229765
## iter 10 value 36.378772
## iter 20 value 35.660524
## final value 35.659431
## converged
## # weights: 165
## initial value 42.292548
## iter 10 value 20.652713
## iter 20 value 7.562533
## iter 30 value 4.414666
## iter 40 value 4.389132
## iter 50 value 4.380923
## iter 60 value 4.363783
## iter 70 value 4.354804
## iter 80 value 4.349484
## iter 90 value 4.347580
## iter 100 value 4.346070
## final value 4.346070
## stopped after 100 iterations
## # weights: 616
## initial value 51.161062
## iter 10 value 13.850106
## iter 20 value 5.969366
## iter 30 value 3.075719
## iter 40 value 2.350861
## iter 50 value 2.082022
## iter 60 value 1.934655
## iter 70 value 1.884197
## iter 80 value 1.865850
## iter 90 value 1.854994
## iter 100 value 1.838942
## final value 1.838942
## stopped after 100 iterations
## # weights: 780
## initial value 43.836819
## iter 10 value 20.071372
## iter 20 value 7.983087
## iter 30 value 2.659602
## iter 40 value 2.597720
## iter 50 value 1.908211
## iter 60 value 0.290933
## iter 70 value 0.254480
## iter 80 value 0.233417
## iter 90 value 0.205674
## iter 100 value 0.175149
## final value 0.175149
## stopped after 100 iterations
## # weights: 42
## initial value 58.025602
## iter 10 value 37.371499
## iter 20 value 37.315864
## final value 37.315229
## converged
## # weights: 452
## initial value 609.672321
## iter 10 value 39.228104
## iter 20 value 37.523704
## iter 30 value 37.434036
## iter 40 value 37.429962
## final value 37.429948
## converged
## # weights: 83
## initial value 78.301878
## iter 10 value 37.420793
## iter 20 value 37.288175
## iter 30 value 37.284949
## final value 37.284781
## converged
## # weights: 165
## initial value 46.736831
## iter 10 value 11.433514
## iter 20 value 1.717475
## iter 30 value 0.281743
## iter 40 value 0.254186
## iter 50 value 0.216932
## iter 60 value 0.188539
## iter 70 value 0.170083
## iter 80 value 0.158696
## iter 90 value 0.148639
## iter 100 value 0.137387
## final value 0.137387
## stopped after 100 iterations
## # weights: 616
## initial value 37.445786
## iter 10 value 12.560420
## iter 20 value 6.701402
## iter 30 value 3.593111
## iter 40 value 2.797682
## iter 50 value 2.421579
## iter 60 value 2.316518
## iter 70 value 2.257755
## iter 80 value 2.230315
## iter 90 value 2.219032
## iter 100 value 2.214059
## final value 2.214059
## stopped after 100 iterations
## # weights: 780
## initial value 41.840738
## iter 10 value 16.434277
## iter 20 value 6.488681
## iter 30 value 1.876942
## iter 40 value 0.765006
## iter 50 value 0.291335
## iter 60 value 0.269471
## iter 70 value 0.232827
## iter 80 value 0.208372
## iter 90 value 0.193377
## iter 100 value 0.167532
## final value 0.167532
## stopped after 100 iterations
## # weights: 42
## initial value 66.116434
## iter 10 value 38.314290
## iter 20 value 37.356091
## iter 30 value 37.332756
## final value 37.332748
## converged
## # weights: 452
## initial value 573.463639
## iter 10 value 48.728211
## iter 20 value 38.100876
## iter 30 value 37.467570
## iter 40 value 37.432047
## iter 50 value 37.430094
## final value 37.429948
## converged
## # weights: 83
## initial value 92.675417
## iter 10 value 37.331134
## iter 20 value 37.071593
## iter 30 value 37.063970
## final value 37.063966
## converged
## # weights: 165
## initial value 49.903378
## iter 10 value 34.376922
## iter 20 value 16.148617
## iter 30 value 13.953906
## iter 40 value 13.932766
## iter 50 value 13.920510
## iter 60 value 13.910534
## iter 70 value 12.813669
## iter 80 value 7.528844
## iter 90 value 5.982163
## iter 100 value 5.961699
## final value 5.961699
## stopped after 100 iterations
## # weights: 616
## initial value 36.471378
## iter 10 value 14.887129
## iter 20 value 5.583078
## iter 30 value 3.294750
## iter 40 value 2.688810
## iter 50 value 2.525001
## iter 60 value 2.441646
## iter 70 value 2.392923
## iter 80 value 2.377739
## iter 90 value 2.366463
## iter 100 value 2.357317
## final value 2.357317
## stopped after 100 iterations
## # weights: 780
## initial value 46.061865
## iter 10 value 10.661556
## iter 20 value 0.336436
## iter 30 value 0.131546
## iter 40 value 0.100317
## iter 50 value 0.078902
## iter 60 value 0.068074
## iter 70 value 0.061313
## iter 80 value 0.055716
## iter 90 value 0.050316
## iter 100 value 0.045996
## final value 0.045996
## stopped after 100 iterations
## # weights: 42
## initial value 56.415398
## iter 10 value 37.344913
## iter 20 value 37.206354
## final value 37.206239
## converged
## # weights: 452
## initial value 624.089273
## iter 10 value 43.671775
## iter 20 value 37.869603
## iter 30 value 37.458436
## iter 40 value 37.430260
## iter 50 value 37.429948
## iter 50 value 37.429948
## iter 50 value 37.429948
## final value 37.429948
## converged
## # weights: 83
## initial value 78.974910
## iter 10 value 37.195116
## iter 20 value 37.159180
## iter 30 value 36.984115
## iter 40 value 36.731293
## final value 36.730514
## converged
## # weights: 165
## initial value 38.209351
## iter 10 value 25.539625
## iter 20 value 13.896425
## iter 30 value 13.567873
## iter 40 value 12.973842
## iter 50 value 12.764897
## iter 60 value 11.392946
## iter 70 value 9.747052
## iter 80 value 8.266110
## iter 90 value 5.722719
## iter 100 value 4.131324
## final value 4.131324
## stopped after 100 iterations
## # weights: 616
## initial value 48.017027
## iter 10 value 17.883304
## iter 20 value 7.111944
## iter 30 value 3.360980
## iter 40 value 2.644175
## iter 50 value 2.287542
## iter 60 value 2.157949
## iter 70 value 2.120883
## iter 80 value 2.094691
## iter 90 value 2.081399
## iter 100 value 2.070873
## final value 2.070873
## stopped after 100 iterations
## # weights: 780
## initial value 50.236212
## iter 10 value 15.843816
## iter 20 value 8.465304
## iter 30 value 8.338156
## iter 40 value 8.108938
## iter 50 value 4.070357
## iter 60 value 1.649383
## iter 70 value 1.600907
## iter 80 value 1.571623
## iter 90 value 1.542905
## iter 100 value 1.515006
## final value 1.515006
## stopped after 100 iterations
## # weights: 42
## initial value 64.847866
## iter 10 value 35.985667
## iter 20 value 35.971761
## final value 35.971755
## converged
## # weights: 452
## initial value 626.691028
## iter 10 value 40.403666
## iter 20 value 36.110199
## iter 30 value 36.045113
## iter 40 value 36.043671
## final value 36.043653
## converged
## # weights: 83
## initial value 78.174070
## iter 10 value 37.042184
## iter 20 value 35.800497
## iter 30 value 35.742156
## iter 40 value 35.741814
## iter 40 value 35.741813
## iter 40 value 35.741813
## final value 35.741813
## converged
## # weights: 165
## initial value 35.724099
## iter 10 value 15.567152
## iter 20 value 2.438302
## iter 30 value 0.423007
## iter 40 value 0.350050
## iter 50 value 0.298933
## iter 60 value 0.236935
## iter 70 value 0.199693
## iter 80 value 0.162711
## iter 90 value 0.133477
## iter 100 value 0.113916
## final value 0.113916
## stopped after 100 iterations
## # weights: 616
## initial value 55.016857
## iter 10 value 16.032075
## iter 20 value 5.700726
## iter 30 value 2.723938
## iter 40 value 2.283463
## iter 50 value 2.129382
## iter 60 value 2.028967
## iter 70 value 1.997222
## iter 80 value 1.989880
## iter 90 value 1.985893
## iter 100 value 1.980308
## final value 1.980308
## stopped after 100 iterations
## # weights: 83
## initial value 95.734274
## iter 10 value 54.917833
## iter 20 value 54.369808
## iter 30 value 54.179707
## iter 40 value 54.154820
## final value 54.154816
## converged
tictoc::toc()
## 10.8 sec elapsed
# Define trainControl: adaptive resampling ("adaptive_cv") discards clearly
# inferior hyperparameter candidates early instead of fully resampling all of
# them. NOTE(review): this first definition is immediately overwritten by the
# two below — kept to show the progression of options (tutorial material).
fitControl <- caret::trainControl(method = "adaptive_cv", number = 3, repeats = 3)
# Also overwritten below; adds random hyperparameter search on top.
fitControl <- caret::trainControl(method = "adaptive_cv", number = 3, repeats = 3, search = "random")
# Final definition — the one actually used for training. Explicit adaptive
# settings: min = minimum resamples before any candidate is dropped,
# alpha = confidence level for the comparison, method = "BT" uses the
# Bradley-Terry model to rank candidates, complete = FALSE skips the full
# resampling of the winning candidate.
fitControl <- caret::trainControl(method = "adaptive_cv", number = 3, repeats = 3,
  adaptive = list(min = 3, alpha = 0.05, method = "BT", complete = FALSE),
  search = "random"
)
# Start timer & train model
tictoc::tic()
# NOTE(review): object is named svm_model_* but method = "nnet" trains a
# neural network — name kept unchanged for downstream compatibility.
# FIX: nnet ignores `verbose`; trace = FALSE is what actually suppresses the
# optimizer's iteration log (the echoed "# weights ... / iter ..." output).
svm_model_voters_ar <- caret::train(turnout16_2016 ~ ., data = voters_train_data, method = "nnet",
  trControl = fitControl, verbose = FALSE, trace = FALSE, tuneLength = 6
)
##
## Attaching package: 'nnet'
## The following object is masked from 'package:mgcv':
##
## multinom
## # weights: 124
## initial value 44.275967
## iter 10 value 36.440463
## iter 20 value 23.082035
## iter 30 value 21.964016
## iter 40 value 21.761035
## iter 50 value 21.755784
## iter 60 value 21.753926
## iter 70 value 21.753573
## final value 21.753573
## converged
## # weights: 370
## initial value 37.951128
## iter 10 value 16.361507
## iter 20 value 5.417270
## iter 30 value 2.349817
## iter 40 value 2.274324
## iter 50 value 2.233795
## iter 60 value 2.182737
## iter 70 value 2.142867
## iter 80 value 2.102816
## iter 90 value 2.018446
## iter 100 value 1.160599
## final value 1.160599
## stopped after 100 iterations
## # weights: 657
## initial value 498.973531
## iter 10 value 64.312510
## iter 20 value 37.068441
## iter 30 value 36.755749
## iter 40 value 36.737270
## iter 50 value 36.729396
## iter 60 value 36.727778
## final value 36.727700
## converged
## # weights: 534
## initial value 38.918257
## iter 10 value 22.265881
## iter 20 value 5.040982
## iter 30 value 2.795378
## iter 40 value 2.204134
## iter 50 value 1.005775
## iter 60 value 0.753512
## iter 70 value 0.577415
## iter 80 value 0.499433
## iter 90 value 0.460771
## iter 100 value 0.431809
## final value 0.431809
## stopped after 100 iterations
## # weights: 698
## initial value 42.196882
## iter 10 value 23.258456
## iter 20 value 9.731200
## iter 30 value 4.632739
## iter 40 value 2.811316
## iter 50 value 1.774158
## iter 60 value 1.593068
## iter 70 value 1.527281
## iter 80 value 1.453013
## iter 90 value 1.377536
## iter 100 value 1.343026
## final value 1.343026
## stopped after 100 iterations
## # weights: 739
## initial value 80.972030
## iter 10 value 13.258808
## iter 20 value 2.454603
## iter 30 value 1.194309
## iter 40 value 0.828264
## iter 50 value 0.684927
## iter 60 value 0.623850
## iter 70 value 0.599861
## iter 80 value 0.585994
## iter 90 value 0.570180
## iter 100 value 0.560652
## final value 0.560652
## stopped after 100 iterations
## # weights: 124
## initial value 44.588322
## iter 10 value 34.893865
## iter 20 value 24.320215
## iter 30 value 18.790891
## iter 40 value 18.303341
## iter 50 value 18.224331
## iter 60 value 18.213048
## iter 70 value 18.207502
## final value 18.207496
## converged
## # weights: 370
## initial value 47.570546
## iter 10 value 16.794622
## iter 20 value 7.331124
## iter 30 value 0.339760
## iter 40 value 0.223463
## iter 50 value 0.186962
## iter 60 value 0.167297
## iter 70 value 0.157758
## iter 80 value 0.153938
## iter 90 value 0.149573
## iter 100 value 0.143599
## final value 0.143599
## stopped after 100 iterations
## # weights: 657
## initial value 518.008237
## iter 10 value 42.345207
## iter 20 value 37.785224
## iter 30 value 36.768585
## iter 40 value 36.729057
## iter 50 value 36.725311
## iter 60 value 36.724471
## iter 70 value 36.724320
## iter 80 value 36.724246
## final value 36.724228
## converged
## # weights: 534
## initial value 35.907225
## iter 10 value 16.364251
## iter 20 value 0.686190
## iter 30 value 0.485076
## iter 40 value 0.358021
## iter 50 value 0.313532
## iter 60 value 0.294566
## iter 70 value 0.285578
## iter 80 value 0.279374
## iter 90 value 0.271242
## iter 100 value 0.264507
## final value 0.264507
## stopped after 100 iterations
## # weights: 698
## initial value 38.192191
## iter 10 value 12.833944
## iter 20 value 4.433109
## iter 30 value 2.191992
## iter 40 value 1.566489
## iter 50 value 1.251073
## iter 60 value 1.126693
## iter 70 value 1.080244
## iter 80 value 1.052277
## iter 90 value 1.030704
## iter 100 value 1.020197
## final value 1.020197
## stopped after 100 iterations
## # weights: 739
## initial value 56.633001
## iter 10 value 14.671303
## iter 20 value 3.914868
## iter 30 value 1.794527
## iter 40 value 1.204729
## iter 50 value 0.768333
## iter 60 value 0.657568
## iter 70 value 0.565811
## iter 80 value 0.489853
## iter 90 value 0.448023
## iter 100 value 0.424290
## final value 0.424290
## stopped after 100 iterations
## # weights: 124
## initial value 46.134071
## iter 10 value 27.182976
## iter 20 value 17.759840
## iter 30 value 16.349265
## iter 40 value 16.041055
## iter 50 value 16.032849
## final value 16.032839
## converged
## # weights: 370
## initial value 56.619136
## iter 10 value 13.147020
## iter 20 value 4.516580
## iter 30 value 4.423254
## iter 40 value 0.255075
## iter 50 value 0.141184
## iter 60 value 0.116661
## iter 70 value 0.100472
## iter 80 value 0.094996
## iter 90 value 0.084593
## iter 100 value 0.074915
## final value 0.074915
## stopped after 100 iterations
## # weights: 657
## initial value 542.718439
## iter 10 value 41.754015
## iter 20 value 37.258986
## iter 30 value 37.076908
## iter 40 value 37.052470
## iter 50 value 37.049151
## iter 60 value 37.048067
## iter 70 value 37.047686
## iter 80 value 37.047559
## iter 80 value 37.047559
## iter 80 value 37.047559
## final value 37.047559
## converged
## # weights: 534
## initial value 43.062723
## iter 10 value 10.603639
## iter 20 value 0.642486
## iter 30 value 0.468601
## iter 40 value 0.333013
## iter 50 value 0.271886
## iter 60 value 0.249356
## iter 70 value 0.233858
## iter 80 value 0.216051
## iter 90 value 0.202799
## iter 100 value 0.197191
## final value 0.197191
## stopped after 100 iterations
## # weights: 698
## initial value 50.038605
## iter 10 value 13.446323
## iter 20 value 2.858862
## iter 30 value 1.591871
## iter 40 value 1.209430
## iter 50 value 1.041773
## iter 60 value 0.933315
## iter 70 value 0.884087
## iter 80 value 0.844380
## iter 90 value 0.807439
## iter 100 value 0.787761
## final value 0.787761
## stopped after 100 iterations
## # weights: 739
## initial value 45.066974
## iter 10 value 11.716800
## iter 20 value 3.408476
## iter 30 value 2.021217
## iter 40 value 1.051264
## iter 50 value 0.622654
## iter 60 value 0.457695
## iter 70 value 0.388328
## iter 80 value 0.352778
## iter 90 value 0.332328
## iter 100 value 0.321958
## final value 0.321958
## stopped after 100 iterations
## Loading required namespace: BradleyTerry2
## Warning in eval(family$initialize): non-integer counts in a binomial glm!
## # weights: 124
## initial value 52.061078
## iter 10 value 30.608033
## iter 20 value 23.645003
## iter 30 value 22.431176
## iter 40 value 20.342526
## iter 50 value 20.140440
## iter 60 value 20.038633
## iter 70 value 19.961808
## iter 80 value 19.531554
## iter 90 value 19.427643
## final value 19.424994
## converged
## # weights: 370
## initial value 38.288446
## iter 10 value 15.952413
## iter 20 value 8.451919
## iter 30 value 6.168426
## iter 40 value 6.098642
## iter 50 value 5.595655
## iter 60 value 4.433657
## iter 70 value 4.394847
## iter 80 value 3.606427
## iter 90 value 3.569006
## iter 100 value 3.172624
## final value 3.172624
## stopped after 100 iterations
## # weights: 657
## initial value 542.761320
## iter 10 value 47.830405
## iter 20 value 38.282873
## iter 30 value 36.871920
## iter 40 value 36.736721
## iter 50 value 36.716103
## iter 60 value 36.713709
## iter 70 value 36.712793
## iter 80 value 36.712613
## final value 36.712607
## converged
## # weights: 534
## initial value 38.010202
## iter 10 value 10.767306
## iter 20 value 6.393577
## iter 30 value 2.640509
## iter 40 value 1.051689
## iter 50 value 0.786490
## iter 60 value 0.575366
## iter 70 value 0.485198
## iter 80 value 0.424234
## iter 90 value 0.389048
## iter 100 value 0.362869
## final value 0.362869
## stopped after 100 iterations
## # weights: 698
## initial value 68.400422
## iter 10 value 15.020680
## iter 20 value 4.030552
## iter 30 value 2.259889
## iter 40 value 1.521236
## iter 50 value 1.350497
## iter 60 value 1.269584
## iter 70 value 1.226794
## iter 80 value 1.201477
## iter 90 value 1.184371
## iter 100 value 1.171294
## final value 1.171294
## stopped after 100 iterations
## # weights: 739
## initial value 37.948677
## iter 10 value 10.517782
## iter 20 value 1.164265
## iter 30 value 0.763354
## iter 40 value 0.574075
## iter 50 value 0.527088
## iter 60 value 0.507957
## iter 70 value 0.498783
## iter 80 value 0.492070
## iter 90 value 0.484097
## iter 100 value 0.476430
## final value 0.476430
## stopped after 100 iterations
## Warning in eval(family$initialize): non-integer counts in a binomial glm!
## # weights: 534
## initial value 41.150937
## iter 10 value 10.999895
## iter 20 value 1.063361
## iter 30 value 0.675099
## iter 40 value 0.512373
## iter 50 value 0.434325
## iter 60 value 0.399739
## iter 70 value 0.380227
## iter 80 value 0.368459
## iter 90 value 0.355592
## iter 100 value 0.346002
## final value 0.346002
## stopped after 100 iterations
## # weights: 698
## initial value 91.064393
## iter 10 value 20.089329
## iter 20 value 4.662915
## iter 30 value 2.081509
## iter 40 value 1.501076
## iter 50 value 1.384736
## iter 60 value 1.329129
## iter 70 value 1.286587
## iter 80 value 1.257335
## iter 90 value 1.231171
## iter 100 value 1.211073
## final value 1.211073
## stopped after 100 iterations
## # weights: 739
## initial value 44.933921
## iter 10 value 21.920467
## iter 20 value 6.097234
## iter 30 value 2.048703
## iter 40 value 1.122254
## iter 50 value 0.895622
## iter 60 value 0.785474
## iter 70 value 0.694390
## iter 80 value 0.613053
## iter 90 value 0.561664
## iter 100 value 0.533745
## final value 0.533745
## stopped after 100 iterations
## # weights: 124
## initial value 48.994143
## iter 10 value 33.574701
## iter 20 value 26.909900
## iter 30 value 26.001210
## iter 40 value 23.664357
## iter 50 value 21.164330
## iter 60 value 20.646224
## iter 70 value 20.614152
## iter 80 value 20.600616
## final value 20.600604
## converged
## Warning in eval(family$initialize): non-integer counts in a binomial glm!
## # weights: 534
## initial value 37.838625
## iter 10 value 15.678357
## iter 20 value 8.854845
## iter 30 value 6.251560
## iter 40 value 2.051770
## iter 50 value 1.437367
## iter 60 value 1.055765
## iter 70 value 0.824039
## iter 80 value 0.646349
## iter 90 value 0.519056
## iter 100 value 0.444675
## final value 0.444675
## stopped after 100 iterations
## # weights: 698
## initial value 52.272514
## iter 10 value 20.258588
## iter 20 value 9.110956
## iter 30 value 4.292440
## iter 40 value 2.182366
## iter 50 value 1.523808
## iter 60 value 1.315219
## iter 70 value 1.246812
## iter 80 value 1.212656
## iter 90 value 1.195883
## iter 100 value 1.187433
## final value 1.187433
## stopped after 100 iterations
## # weights: 124
## initial value 42.446506
## iter 10 value 24.577876
## iter 20 value 20.582477
## iter 30 value 19.437867
## iter 40 value 19.382912
## iter 50 value 19.378901
## final value 19.378884
## converged
## Warning in eval(family$initialize): non-integer counts in a binomial glm!
## # weights: 534
## initial value 42.326288
## iter 10 value 10.205576
## iter 20 value 3.252371
## iter 30 value 1.208405
## iter 40 value 0.884720
## iter 50 value 0.636052
## iter 60 value 0.509704
## iter 70 value 0.451380
## iter 80 value 0.391670
## iter 90 value 0.356216
## iter 100 value 0.331170
## final value 0.331170
## stopped after 100 iterations
## # weights: 698
## initial value 97.677119
## iter 10 value 16.622356
## iter 20 value 8.392211
## iter 30 value 4.057524
## iter 40 value 2.165973
## iter 50 value 1.641724
## iter 60 value 1.441719
## iter 70 value 1.338479
## iter 80 value 1.269219
## iter 90 value 1.216098
## iter 100 value 1.195676
## final value 1.195676
## stopped after 100 iterations
## # weights: 124
## initial value 45.207165
## iter 10 value 28.675421
## iter 20 value 22.506782
## iter 30 value 20.422020
## iter 40 value 19.712459
## iter 50 value 19.577957
## iter 60 value 19.577109
## final value 19.577093
## converged
## Warning in eval(family$initialize): non-integer counts in a binomial glm!
## # weights: 534
## initial value 37.014117
## iter 10 value 12.006555
## iter 20 value 3.416121
## iter 30 value 1.247261
## iter 40 value 0.720127
## iter 50 value 0.523285
## iter 60 value 0.443740
## iter 70 value 0.403475
## iter 80 value 0.370679
## iter 90 value 0.345999
## iter 100 value 0.331503
## final value 0.331503
## stopped after 100 iterations
## # weights: 698
## initial value 76.707941
## iter 10 value 19.093034
## iter 20 value 4.503402
## iter 30 value 2.394702
## iter 40 value 1.696850
## iter 50 value 1.490945
## iter 60 value 1.379735
## iter 70 value 1.317353
## iter 80 value 1.262156
## iter 90 value 1.233331
## iter 100 value 1.214273
## final value 1.214273
## stopped after 100 iterations
## Warning in eval(family$initialize): non-integer counts in a binomial glm!
## # weights: 534
## initial value 55.940776
## iter 10 value 21.982310
## iter 20 value 4.782215
## iter 30 value 3.249293
## iter 40 value 1.919848
## iter 50 value 0.945792
## iter 60 value 0.739518
## iter 70 value 0.607314
## iter 80 value 0.553013
## iter 90 value 0.529495
## iter 100 value 0.514585
## final value 0.514585
## stopped after 100 iterations
# Stop the most recently started tictoc::tic() timer and report elapsed time
# for the preceding caret/nnet training run (tic() call is above this chunk).
tictoc::toc()
## 8.97 sec elapsed
Chapter 3 - Hyperparameter Tuning with mlr
Machine Learning with mlr:
Grid and Random Search with mlr:
Evaluating Hyperparameters with mlr:
Advanced Tuning with mlr:
Example code includes:
vecData <- c(0.08, 0.18, 0.1, 0.12, 0.09, 0.08, 0.2, 0.2, 0.13, 0.18, 0.24, 0.18, 0.31, 0.28, 0.325, 0.323, 0.299, 0.32, 0.329, 0.315, 0.325, 0.325, 0.312, 0.299, 0.48, 0.46, 0.48, 0.49, 0.495, 0.43, 0.4, 0.44, 0.49, 0.44, 0.46, 0.495, 0.49, 0.42, 0.78, 0.85, 0.06, 0.08, 0.2, 0.06, 0.1, 0.15, 0.12, 0.06, 0.15, 0.1, 0.02, 0.09, 0.1, 0.08, 0.09, 0.2, 0.28, 0.265, 0.275, 0.295, 0.32, 0.25, 0.27, 0.27, 0.29, 0.288, 0.255, 0.295, 0.243, 0.295, 0.276, 0.258, 0.28, 0.255, 0.265, 0.255, 0.39, 0.38, 0.37, 0.38, 0.1, 0.1, 0.2, 0.18, 0.1, 0.12, 0.19, 0.14, 0.18, 0.17, 0.1, 0.23, 0.18, 0.2, 0.09, 0.06, 0.15, 0.29, 0.3, 0.27, 0.3, 0.295, 0.29, 0.258, 0.32, 0.3, 0.29, 0.26, 0.305, 0.32, 0.295, 0.285, 0.3, 0.4, 0.4, 0.41, 0.41, 0.44, 0.42, 0.43, 0.08, 0.18, 0.1, 0.12, 0.3, 0.325, 0.45, 0.49, 0.39, 0.34, 0.75, 0.51, 0.1, 0.16, 0.25, 0.32, 0.32, 0.28, 0.55, 0.69, 0.61, 0.9, 0.8, 0.7, 0.12, 0.2, 0.3, 0.245, 0.276, 0.45, 0.33, 0.33, 0.34, 0.55, 0.78, 0.82, 0.9, 0.7, 0.21, 0.05, 0.06, 0.08, 0.14, 0.06, 0.25, 0.32, 0.28, 0.29, 0.295, 0.42, 0.33, 0.55, 0.6, 0.58, 0.61, 0.68, 0.1, 0.06, 0.1, 0.2, 0.12, 0.29, 0.1, 0.31, 0.29, 0.31, 0.305, 0.25, 0.27, 0.29, 0.255, 0.31, 0.65, 0.75, 0.76, 0.72, 0.05, 0.1, 0.06, 0.01, 0.1, 0.1, 0.2, 0.3, 0.27, 0.245, 0.38, 0.49, 0.33, 0.36, 0.39, 0.7, 0.72, 0.52, 0.6, 0.77, 0.79, 0.06, 0.08, 0.12, 0.2, 0.25, 0.3, 0.28, 0.255, 0.27, 0.3, 0.28, 0.255, 0.27, 0.59, 0.64, 0.85, 0.18, 0.12, 0.18, 0.09, 0.08, 0.21, 0.305, 0.1, 0.55, 0.7, 0.75, 0.68, 0.62, 0.28, 0.6, 0.85, 0.71, 0.32, 0.58, 0.41, 0.69, 0.38, 0.89, 0.31, 0.72, 0.02, 0.28, 0.46, 0.52, 0.67, 0.95, 0.28, 0.76, 0.15, 0.38, 0.58, 0.27, 0.12, 0.59, 0.88, 0.11, 0.38, 0.67, 0.52, 0.72, 0.68, 0.91, 0.05, 0.08, 0.35, 0.51, 0.1, 0.05, 0.2, 0.35, 0.75, 0.22, 0.36, 0.12, 0.33, 0.6, 0.53, 0.73, 0.12, 0.57, 0.72, 0.86, 0.79, 0.15, 0.1, 0.32, 0.4, 0.79, 0.86, 0.73, 0.08, 0.31, 0.81, 0.88, 0.4, 0.35, 0.8, 0.72, 0.02, 0.4, 0.32, 0.53, 0.15, 0.52, 0.7, 0.37, 0.31, 0.75, 0.38, 0.55, 0.61, 0.8, 0.75, 0.19, 0.37, 0.36, 
0.66, 0.72, 0.78, 0.19, 0.4, 0.37, 0.52, 0.26, 0.52, 0.64, 0.55, 0.31, 0.56, 0.6, 0.63, 0.52, 0.29, 0.18, 0.54, 0.26, 0.41, 0.33, 0.58, 0.8, 0.87, 0.51, 0.24, 0.3, 0.15, 0.35, 0.18, 0.94, 0.31, 0.2, 0.38, 0.71, 0.18, 0.33, 0.42, 0.33, 0.31, 0.32, 0.33, 0.89, 0.4, 0.8, 0.32, 0.49, 0.92, 0.22, 0.7, 0.95, 0.65, 0.14, 0.77, 0.27, 0.3, 0.53, 0.75, 0.26, 0.24, 0.01, 0.9, 0.3, 0.65, 0.8, 0.25, 0.98, 0.72, 0.41, 0.08, 0.27, 0.78, 0.76, 0.65, 0.72, 0.76, 0.78, 0.42, 0.64, 0.75, 0.48, 0.28, 0.75, 0.1, 0.44, 0.76, 0.48, 0.7, 0.41, 0.78, 0.23, 0.62, 0.77, 0.42, 0.76, 0.27, 0.4, 0.65, 0.72, 0.28, 0.63, 0.06, 0.48, 0.78, 0.27, 0.65, 0.78, 0.3, 0.12, 0.29, 0.31, 0.49, 0.29, 0.64, 0.14, 0.31, 0.51, 0.29, 0.84, 0.19, 0.19, 0.3, 0.55, 0.02, 0.29, 0.3, 0.12, 0.09, 0.29, 0.78, 0.31, 0.25, 0.29, 0.4, 0.81, 0.31, 0.61, 0.25, 0.26, 0.1, 0.31, 0.18, 0.22, 0.56, 0.09, 0.9, 0.81, 0.9, 0.8, 0.85, 0.56, 0.78, 0.78, 0.77, 0.9, 0.86, 0.82, 0.75, 0.78, 0.79, 0.8, 0.87, 0.58, 0.79, 0.7, 0.81, 0.76, 0.5, 0.66, 0.71, 0.65, 0.77, 0.86, 0.83, 0.89, 0.9, 0.85, 0.71, 0.83, 0.89, 0.93, 0.47, 0.8, 0.75, 0.68, 0.33, 0.24, 0.25, 0.3, 0.33, 0.29, 0.2, 0.25, 0.24, 0.26, 0.1, 0.05, 0.26, 0.1, 0.01, 0.28, 0.32, 0.1, 0.3, 0.28, 0.24, 0.26, 0.25, 0.28, 0.18, 0.24, 0.15, 0.19, 0.29, 0.1, 0.33, 0.3, 0.13, 0.25, 0.28, 0.14, 0.34, 0.26, 0.1, 0.3, 0.3, 0.34, 0.6, 0.66, 0.65, 0.59, 0.45, 0.6, 0.25, 0.66, 0.62, 0.45, 0.55, 0.25, 0.59, 0.56, 0.51, 0.51, 0.67, 0.58, 0.53, 0.67, 0.67, 0.56, 0.34, 0.54, 0.67, 0.59, 0.54, 0.3, 0.55, 0.45, 0.83, 0.67, 0.65, 0.5, 0.58, 0.56, 0.48, 0.64)
# Build the training set: the UNS knowledge-level target (40 observations
# each of High / Low / Medium) alongside the five numeric predictors.
knowledge_train_data <- tibble(UNS=rep(c("High", "Low", "Medium"), each=40))
# Reshape the flat vecData vector column-wise into a 120-row data frame.
mtxData <- data.frame(matrix(vecData, nrow=120, byrow=FALSE))
names(mtxData) <- c("STG", "SCG", "STR", "LPR", "PEG")
# FIX: as.tibble() is deprecated (see the warning emitted below in the
# original transcript); as_tibble() is the supported replacement.
knowledge_train_data <- bind_cols(as_tibble(mtxData), knowledge_train_data)
## Warning: `as.tibble()` is deprecated, use `as_tibble()` (but mind the new semantics).
## This warning is displayed once per session.
# Quick structural summary of the assembled training data (columns, types,
# first values) — confirms 6 variables including the UNS target.
glimpse(knowledge_train_data)
## Observations: 120
## Variables: 6
## $ STG <dbl> 0.080, 0.180, 0.100, 0.120, 0.090, 0.080, 0.200, 0.200, 0....
## $ SCG <dbl> 0.080, 0.180, 0.100, 0.120, 0.300, 0.325, 0.450, 0.490, 0....
## $ STR <dbl> 0.10, 0.55, 0.70, 0.75, 0.68, 0.62, 0.28, 0.60, 0.85, 0.71...
## $ LPR <dbl> 0.24, 0.30, 0.15, 0.35, 0.18, 0.94, 0.31, 0.20, 0.38, 0.71...
## $ PEG <dbl> 0.90, 0.81, 0.90, 0.80, 0.85, 0.56, 0.78, 0.78, 0.77, 0.90...
## $ UNS <chr> "High", "High", "High", "High", "High", "High", "High", "H...
library(mlr)
## Loading required package: ParamHelpers
##
## Attaching package: 'ParamHelpers'
## The following object is masked from 'package:pkgmaker':
##
## isInteger
##
## Attaching package: 'mlr'
## The following object is masked _by_ '.GlobalEnv':
##
## rmse
## The following object is masked from 'package:caret':
##
## train
## The following object is masked from 'package:e1071':
##
## impute
## The following object is masked from 'package:processmapR':
##
## performance
# Create classification task
# mlr converts the tibble to a plain data.frame internally (see warning below).
task <- mlr::makeClassifTask(data = knowledge_train_data, target = "UNS")
## Warning in makeTask(type = type, data = data, weights = weights, blocking
## = blocking, : Provided data is not a pure data.frame but from class tbl_df,
## hence it will be converted.
# List all learners mlr knows about and keep only the classification ones.
# FIX: the original pattern "classif." used an unescaped regex dot, which
# matches any character; escape it and anchor at the start so only class
# names with the literal "classif." prefix are kept.
mlr::listLearners() %>%
  as.data.frame() %>%
  select(class, short.name, package) %>%
  filter(grepl("^classif\\.", class))
## Warning in listLearners.character(obj = NA_character_, properties, quiet, : The following learners could not be constructed, probably because their packages are not installed:
## classif.ada,classif.bartMachine,classif.boosting,classif.bst,classif.clusterSVM,classif.dbnDNN,classif.dcSVM,classif.earth,classif.evtree,classif.extraTrees,classif.fdausc.glm,classif.fdausc.kernel,classif.fdausc.knn,classif.fdausc.np,classif.gamboost,classif.gaterSVM,classif.geoDA,classif.glmboost,classif.kknn,classif.LiblineaRL1L2SVC,classif.LiblineaRL1LogReg,classif.LiblineaRL2L1SVC,classif.LiblineaRL2LogReg,classif.LiblineaRL2SVC,classif.LiblineaRMultiClassSVC,classif.linDA,classif.liquidSVM,classif.mda,classif.mlp,classif.neuralnet,classif.nnTrain,classif.nodeHarvest,classif.pamr,classif.penalized,classif.plr,classif.quaDA,classif.randomForestSRC,classif.rFerns,classif.rknn,classif.rotationForest,classif.RRF,classif.rrlda,classif.saeDNN,classif.sda,classif.sparseLDA,cluster.cmeans,cluster.kmeans,multilabel.randomForestSRC,multilabel.rFerns,regr.bartMachine,regr.bcart,regr.bgp,regr.bgpllm,regr.blm,regr.brnn,regr.bst,regr.btgp,regr.btgpllm,regr.btlm,regr.crs,regr.earth,regr.evtree,regr.extraTrees,regr.FDboost,regr.frbs,regr.gamboost,regr.glmboost,regr.GPfit,regr.kknn,regr.km,regr.laGP,regr.LiblineaRL2L1SVR,regr.LiblineaRL2L2SVR,regr.liquidSVM,regr.mars,regr.nodeHarvest,regr.penalized,regr.randomForestSRC,regr.rknn,regr.RRF,regr.rsm,regr.slim,surv.CoxBoost,surv.cv.CoxBoost,surv.gamboost,surv.glmboost,surv.randomForestSRC
## Check ?learners to see which packages you need or install mlr with all suggestions.
## class short.name
## 1 classif.ada ada
## 2 classif.adaboostm1 adaboostm1
## 3 classif.bartMachine bartmachine
## 4 classif.binomial binomial
## 5 classif.boosting adabag
## 6 classif.bst bst
## 7 classif.C50 C50
## 8 classif.cforest cforest
## 9 classif.clusterSVM clusterSVM
## 10 classif.ctree ctree
## 11 classif.cvglmnet cvglmnet
## 12 classif.dbnDNN dbn.dnn
## 13 classif.dcSVM dcSVM
## 14 classif.earth fda
## 15 classif.evtree evtree
## 16 classif.extraTrees extraTrees
## 17 classif.fdausc.glm fdausc.glm
## 18 classif.fdausc.kernel fdausc.kernel
## 19 classif.fdausc.knn fdausc.knn
## 20 classif.fdausc.np fdausc.np
## 21 classif.featureless featureless
## 22 classif.fnn fnn
## 23 classif.gamboost gamboost
## 24 classif.gaterSVM gaterSVM
## 25 classif.gausspr gausspr
## 26 classif.gbm gbm
## 27 classif.geoDA geoda
## 28 classif.glmboost glmboost
## 29 classif.glmnet glmnet
## 30 classif.h2o.deeplearning h2o.dl
## 31 classif.h2o.gbm h2o.gbm
## 32 classif.h2o.glm h2o.glm
## 33 classif.h2o.randomForest h2o.rf
## 34 classif.IBk ibk
## 35 classif.J48 j48
## 36 classif.JRip jrip
## 37 classif.kknn kknn
## 38 classif.knn knn
## 39 classif.ksvm ksvm
## 40 classif.lda lda
## 41 classif.LiblineaRL1L2SVC liblinl1l2svc
## 42 classif.LiblineaRL1LogReg liblinl1logreg
## 43 classif.LiblineaRL2L1SVC liblinl2l1svc
## 44 classif.LiblineaRL2LogReg liblinl2logreg
## 45 classif.LiblineaRL2SVC liblinl2svc
## 46 classif.LiblineaRMultiClassSVC liblinmulticlasssvc
## 47 classif.linDA linda
## 48 classif.liquidSVM liquidSVM
## 49 classif.logreg logreg
## 50 classif.lssvm lssvm
## 51 classif.lvq1 lvq1
## 52 classif.mda mda
## 53 classif.mlp mlp
## 54 classif.multinom multinom
## 55 classif.naiveBayes nbayes
## 56 classif.neuralnet neuralnet
## 57 classif.nnet nnet
## 58 classif.nnTrain nn.train
## 59 classif.nodeHarvest nodeHarvest
## 60 classif.OneR oner
## 61 classif.pamr pamr
## 62 classif.PART part
## 63 classif.penalized penalized
## 64 classif.plr plr
## 65 classif.plsdaCaret plsdacaret
## 66 classif.probit probit
## 67 classif.qda qda
## 68 classif.quaDA quada
## 69 classif.randomForest rf
## 70 classif.randomForestSRC rfsrc
## 71 classif.ranger ranger
## 72 classif.rda rda
## 73 classif.rFerns rFerns
## 74 classif.rknn rknn
## 75 classif.rotationForest rotationForest
## 76 classif.rpart rpart
## 77 classif.RRF RRF
## 78 classif.rrlda rrlda
## 79 classif.saeDNN sae.dnn
## 80 classif.sda sda
## 81 classif.sparseLDA sparseLDA
## 82 classif.svm svm
## 83 classif.xgboost xgboost
## package
## 1 ada,rpart
## 2 RWeka
## 3 bartMachine
## 4 stats
## 5 adabag,rpart
## 6 bst,rpart
## 7 C50
## 8 party
## 9 SwarmSVM,LiblineaR
## 10 party
## 11 glmnet
## 12 deepnet
## 13 SwarmSVM,e1071
## 14 earth,stats
## 15 evtree
## 16 extraTrees
## 17 fda.usc
## 18 fda.usc
## 19 fda.usc
## 20 fda.usc
## 21 mlr
## 22 FNN
## 23 mboost
## 24 SwarmSVM
## 25 kernlab
## 26 gbm
## 27 DiscriMiner
## 28 mboost
## 29 glmnet
## 30 h2o
## 31 h2o
## 32 h2o
## 33 h2o
## 34 RWeka
## 35 RWeka
## 36 RWeka
## 37 kknn
## 38 class
## 39 kernlab
## 40 MASS
## 41 LiblineaR
## 42 LiblineaR
## 43 LiblineaR
## 44 LiblineaR
## 45 LiblineaR
## 46 LiblineaR
## 47 DiscriMiner
## 48 liquidSVM
## 49 stats
## 50 kernlab
## 51 class
## 52 mda
## 53 RSNNS
## 54 nnet
## 55 e1071
## 56 neuralnet
## 57 nnet
## 58 deepnet
## 59 nodeHarvest
## 60 RWeka
## 61 pamr
## 62 RWeka
## 63 penalized
## 64 stepPlr
## 65 caret,pls
## 66 stats
## 67 MASS
## 68 DiscriMiner
## 69 randomForest
## 70 randomForestSRC
## 71 ranger
## 72 klaR
## 73 rFerns
## 74 rknn
## 75 rotationForest
## 76 rpart
## 77 RRF
## 78 rrlda
## 79 deepnet
## 80 sda
## 81 sparseLDA,MASS,elasticnet
## 82 e1071
## 83 xgboost
# Create learner
# Random forest with probability predictions; fix.factors.prediction = TRUE
# guards against factor-level mismatches between training and prediction data.
lrn <- mlr::makeLearner("classif.randomForest", predict.type = "prob", fix.factors.prediction = TRUE)
# Get the parameter set for neural networks of the nnet package
# (shows which hyperparameters are tunable, their types, and defaults).
ParamHelpers::getParamSet("classif.nnet")
## Type len Def Constr Req Tunable Trafo
## size integer - 3 0 to Inf - TRUE -
## maxit integer - 100 1 to Inf - TRUE -
## skip logical - FALSE - - TRUE -
## rang numeric - 0.7 -Inf to Inf - TRUE -
## decay numeric - 0 -Inf to Inf - TRUE -
## Hess logical - FALSE - - TRUE -
## trace logical - TRUE - - FALSE -
## MaxNWts integer - 1000 1 to Inf - FALSE -
## abstol numeric - 0.0001 -Inf to Inf - TRUE -
## reltol numeric - 1e-08 -Inf to Inf - TRUE -
# Tuning search space for the nnet learner: a discrete set of hidden-layer
# sizes and a continuous weight-decay range.
param_set <- ParamHelpers::makeParamSet(
  ParamHelpers::makeDiscreteParam("size", values = c(2, 3, 5)),
  ParamHelpers::makeNumericParam("decay", lower = 0.0001, upper = 0.1)
)
# Show the assembled parameter set.
print(param_set)
## Type len Def Constr Req Tunable Trafo
## size discrete - - 2,3,5 - TRUE -
## decay numeric - - 0.0001 to 0.1 - TRUE -
# Define a random search tuning method.
# NOTE(review): no maxit is given here, so mlr's default random-search budget
# applies; this object is redefined further below with maxit = 6 before use.
ctrl_random <- mlr::makeTuneControlRandom()
# Define task
# Same classification task as above; the tibble input is again converted to a
# plain data.frame by mlr (see warning below).
task <- makeClassifTask(data = knowledge_train_data, target = "UNS")
## Warning in makeTask(type = type, data = data, weights = weights, blocking
## = blocking, : Provided data is not a pure data.frame but from class tbl_df,
## hence it will be converted.
# Learner: single-hidden-layer neural net (nnet) with probability output;
# fix.factors.prediction = TRUE avoids factor-level mismatches at predict time.
lrn <- makeLearner(
  "classif.nnet",
  predict.type = "prob",
  fix.factors.prediction = TRUE
)
# Search space: discrete hidden-layer sizes and a continuous decay range.
param_set <- makeParamSet(
  makeDiscreteParam("size", values = c(2, 3, 5)),
  makeNumericParam("decay", lower = 0.0001, upper = 0.1)
)
# Random search tuning with a budget of 6 evaluated configurations.
ctrl_random <- mlr::makeTuneControlRandom(maxit = 6)
# Define a 2 x 2 repeated cross-validation scheme
# FIX: the original `folds = 2 * 2` evaluated to 4 folds and left RepCV's
# repetition count at its default — not the 2 x 2 scheme the comment
# describes. Spell out folds and reps explicitly.
cross_val <- mlr::makeResampleDesc("RepCV", folds = 2, reps = 2)
# Tune hyperparameters
# Time the tuning run; the matching tictoc::toc() appears after the output.
tictoc::tic()
# Random search (ctrl_random, 6 iterations) over param_set, evaluating each
# candidate with the repeated cross-validation scheme defined above.
lrn_tune <- mlr::tuneParams(lrn, task, resampling = cross_val, control = ctrl_random, par.set=param_set)
## [Tune] Started tuning learner classif.nnet for parameter set:
## Type len Def Constr Req Tunable Trafo
## size discrete - - 2,3,5 - TRUE -
## decay numeric - - 0.0001 to 0.1 - TRUE -
## With control class: TuneControlRandom
## Imputation value: 1
## [Tune-x] 1: size=2; decay=0.00779
## # weights: 21
## initial value 112.737276
## iter 10 value 59.082328
## iter 20 value 19.787952
## iter 30 value 18.064517
## iter 40 value 17.952929
## iter 50 value 17.884020
## iter 60 value 17.869365
## iter 70 value 17.868531
## final value 17.868519
## converged
## # weights: 21
## initial value 102.199839
## iter 10 value 41.476402
## iter 20 value 21.007165
## iter 30 value 19.846955
## iter 40 value 19.077249
## iter 50 value 17.893235
## iter 60 value 16.894414
## iter 70 value 16.745894
## iter 80 value 16.738340
## iter 90 value 16.737157
## final value 16.737150
## converged
## # weights: 21
## initial value 99.375959
## iter 10 value 45.481884
## iter 20 value 11.303700
## iter 30 value 9.236685
## iter 40 value 9.102399
## iter 50 value 9.089763
## iter 60 value 9.080980
## iter 70 value 9.080516
## iter 80 value 9.080510
## final value 9.080510
## converged
## # weights: 21
## initial value 101.231658
## iter 10 value 27.887510
## iter 20 value 18.436825
## iter 30 value 18.173475
## iter 40 value 16.989967
## iter 50 value 15.392260
## iter 60 value 15.152620
## iter 70 value 15.141648
## iter 80 value 15.141448
## final value 15.141443
## converged
## # weights: 21
## initial value 103.763017
## iter 10 value 28.153080
## iter 20 value 13.128587
## iter 30 value 12.353443
## iter 40 value 12.196811
## iter 50 value 12.188780
## iter 60 value 12.162821
## iter 70 value 12.161957
## final value 12.161951
## converged
## # weights: 21
## initial value 109.317314
## iter 10 value 22.497005
## iter 20 value 19.218313
## iter 30 value 17.559913
## iter 40 value 17.294557
## iter 50 value 17.194035
## iter 60 value 17.123278
## iter 70 value 17.122103
## final value 17.122094
## converged
## # weights: 21
## initial value 118.090824
## iter 10 value 64.880621
## iter 20 value 17.665442
## iter 30 value 16.892673
## iter 40 value 16.307504
## iter 50 value 15.406085
## iter 60 value 14.466792
## iter 70 value 14.191129
## iter 80 value 14.177345
## iter 90 value 14.175877
## iter 100 value 14.175815
## final value 14.175815
## stopped after 100 iterations
## # weights: 21
## initial value 98.828661
## iter 10 value 22.407032
## iter 20 value 17.481025
## iter 30 value 16.857187
## iter 40 value 16.074096
## iter 50 value 15.692011
## iter 60 value 15.429560
## iter 70 value 15.418977
## iter 80 value 15.418827
## final value 15.418817
## converged
## # weights: 21
## initial value 101.619277
## iter 10 value 38.163020
## iter 20 value 22.213117
## iter 30 value 20.822530
## iter 40 value 19.488851
## iter 50 value 18.018561
## iter 60 value 17.798223
## iter 70 value 17.773771
## iter 80 value 17.773581
## final value 17.773574
## converged
## # weights: 21
## initial value 100.731946
## iter 10 value 38.267933
## iter 20 value 19.672372
## iter 30 value 18.911917
## iter 40 value 18.558480
## iter 50 value 17.258065
## iter 60 value 16.861026
## iter 70 value 16.827239
## iter 80 value 16.826900
## final value 16.826876
## converged
## # weights: 21
## initial value 115.768265
## iter 10 value 88.665348
## iter 20 value 26.212407
## iter 30 value 17.600437
## iter 40 value 16.925272
## iter 50 value 16.100945
## iter 60 value 14.375765
## iter 70 value 12.751145
## iter 80 value 12.584634
## iter 90 value 12.577752
## iter 100 value 12.577195
## final value 12.577195
## stopped after 100 iterations
## # weights: 21
## initial value 112.265356
## iter 10 value 45.306133
## iter 20 value 13.879122
## iter 30 value 12.666204
## iter 40 value 12.333084
## iter 50 value 12.243346
## iter 60 value 12.191870
## iter 70 value 12.189015
## final value 12.189010
## converged
## # weights: 21
## initial value 100.973896
## iter 10 value 60.966338
## iter 20 value 19.806172
## iter 30 value 18.313350
## iter 40 value 18.128526
## iter 50 value 18.071983
## iter 60 value 18.033742
## iter 70 value 18.033178
## final value 18.033164
## converged
## # weights: 21
## initial value 101.175728
## iter 10 value 53.762590
## iter 20 value 17.422167
## iter 30 value 16.043078
## iter 40 value 14.997529
## iter 50 value 14.821277
## iter 60 value 14.720612
## iter 70 value 14.712664
## iter 80 value 14.712381
## final value 14.712378
## converged
## # weights: 21
## initial value 107.713607
## iter 10 value 69.080911
## iter 20 value 38.532498
## iter 30 value 25.196611
## iter 40 value 20.088302
## iter 50 value 17.698423
## iter 60 value 16.998775
## iter 70 value 16.051191
## iter 80 value 13.306311
## iter 90 value 12.372010
## iter 100 value 12.186085
## final value 12.186085
## stopped after 100 iterations
## # weights: 21
## initial value 122.894748
## iter 10 value 55.959344
## iter 20 value 15.287731
## iter 30 value 14.405829
## iter 40 value 14.031984
## iter 50 value 13.877002
## iter 60 value 13.855511
## iter 70 value 13.853268
## iter 80 value 13.853077
## final value 13.853066
## converged
## # weights: 21
## initial value 102.781029
## iter 10 value 39.478082
## iter 20 value 21.367085
## iter 30 value 20.140264
## iter 40 value 18.312745
## iter 50 value 17.867227
## iter 60 value 16.344864
## iter 70 value 15.556351
## iter 80 value 15.496841
## iter 90 value 15.491891
## iter 100 value 15.491743
## final value 15.491743
## stopped after 100 iterations
## # weights: 21
## initial value 117.702933
## iter 10 value 89.924185
## iter 20 value 28.816578
## iter 30 value 19.060347
## iter 40 value 18.130611
## iter 50 value 17.770208
## iter 60 value 17.681002
## iter 70 value 17.679080
## iter 80 value 17.679030
## final value 17.679029
## converged
## # weights: 21
## initial value 114.016663
## iter 10 value 27.475688
## iter 20 value 16.855568
## iter 30 value 14.459688
## iter 40 value 12.888182
## iter 50 value 11.644679
## iter 60 value 11.445156
## iter 70 value 11.431491
## iter 80 value 11.431331
## final value 11.431329
## converged
## # weights: 21
## initial value 99.462865
## iter 10 value 34.295267
## iter 20 value 19.197133
## iter 30 value 17.189905
## iter 40 value 16.182226
## iter 50 value 16.105666
## iter 60 value 16.082446
## iter 70 value 16.080713
## final value 16.080702
## converged
## # weights: 21
## initial value 103.887812
## iter 10 value 49.227809
## iter 20 value 17.457461
## iter 30 value 12.595677
## iter 40 value 11.770740
## iter 50 value 9.766838
## iter 60 value 9.070627
## iter 70 value 8.955469
## iter 80 value 8.953074
## final value 8.953023
## converged
## # weights: 21
## initial value 104.712726
## iter 10 value 62.014017
## iter 20 value 21.311872
## iter 30 value 19.734365
## iter 40 value 18.682411
## iter 50 value 18.540703
## iter 60 value 17.859976
## iter 70 value 16.105503
## iter 80 value 15.820356
## iter 90 value 15.806332
## iter 100 value 15.801633
## final value 15.801633
## stopped after 100 iterations
## # weights: 21
## initial value 100.548011
## iter 10 value 51.276972
## iter 20 value 21.098265
## iter 30 value 19.751831
## iter 40 value 18.733961
## iter 50 value 18.450500
## iter 60 value 18.411604
## iter 70 value 18.408414
## final value 18.408391
## converged
## # weights: 21
## initial value 104.288933
## iter 10 value 39.072145
## iter 20 value 20.281542
## iter 30 value 19.458310
## iter 40 value 18.223241
## iter 50 value 17.624455
## iter 60 value 17.355498
## iter 70 value 17.328431
## iter 80 value 17.328044
## final value 17.328036
## converged
## # weights: 21
## initial value 114.214241
## iter 10 value 80.869627
## iter 20 value 15.349282
## iter 30 value 14.213546
## iter 40 value 14.194575
## iter 50 value 14.181571
## iter 60 value 14.177726
## iter 70 value 14.176879
## iter 80 value 14.176837
## final value 14.176837
## converged
## # weights: 21
## initial value 102.809099
## iter 10 value 77.921095
## iter 20 value 19.882880
## iter 30 value 15.748019
## iter 40 value 14.738512
## iter 50 value 13.531410
## iter 60 value 13.341704
## iter 70 value 13.241870
## iter 80 value 13.237638
## final value 13.237515
## converged
## # weights: 21
## initial value 111.720668
## iter 10 value 33.667679
## iter 20 value 23.759344
## iter 30 value 20.718672
## iter 40 value 18.723184
## iter 50 value 18.209350
## iter 60 value 18.037709
## iter 70 value 18.029824
## iter 80 value 18.029689
## final value 18.029683
## converged
## # weights: 21
## initial value 104.334473
## iter 10 value 31.636296
## iter 20 value 15.603153
## iter 30 value 14.847099
## iter 40 value 14.251179
## iter 50 value 14.056077
## iter 60 value 13.974030
## iter 70 value 13.968621
## final value 13.968617
## converged
## # weights: 21
## initial value 102.594081
## iter 10 value 27.031048
## iter 20 value 19.968774
## iter 30 value 19.160768
## iter 40 value 18.315719
## iter 50 value 17.962175
## iter 60 value 17.780765
## iter 70 value 17.776503
## iter 80 value 17.776468
## final value 17.776466
## converged
## # weights: 21
## initial value 106.070284
## iter 10 value 48.870288
## iter 20 value 18.453964
## iter 30 value 17.291129
## iter 40 value 14.692545
## iter 50 value 13.963406
## iter 60 value 13.875929
## iter 70 value 13.870289
## iter 80 value 13.869936
## final value 13.869925
## converged
## # weights: 21
## initial value 105.045744
## iter 10 value 47.772935
## iter 20 value 22.228715
## iter 30 value 18.950780
## iter 40 value 18.304071
## iter 50 value 17.820194
## iter 60 value 17.534296
## iter 70 value 17.521210
## iter 80 value 17.520818
## final value 17.520806
## converged
## # weights: 21
## initial value 104.213667
## iter 10 value 34.144032
## iter 20 value 17.164156
## iter 30 value 11.656114
## iter 40 value 10.411400
## iter 50 value 10.161250
## iter 60 value 10.145989
## iter 70 value 10.145053
## iter 80 value 10.144940
## final value 10.144933
## converged
## # weights: 21
## initial value 98.281324
## iter 10 value 28.888058
## iter 20 value 21.213893
## iter 30 value 19.672023
## iter 40 value 18.740467
## iter 50 value 18.530935
## iter 60 value 18.361547
## iter 70 value 18.359102
## final value 18.359062
## converged
## # weights: 21
## initial value 119.356596
## iter 10 value 87.287341
## iter 20 value 23.033563
## iter 30 value 16.892631
## iter 40 value 16.411949
## iter 50 value 16.121386
## iter 60 value 15.951808
## iter 70 value 15.935035
## iter 80 value 15.934605
## final value 15.934601
## converged
## # weights: 21
## initial value 99.650533
## iter 10 value 47.049033
## iter 20 value 20.939465
## iter 30 value 17.599154
## iter 40 value 16.841298
## iter 50 value 15.360520
## iter 60 value 13.305444
## iter 70 value 12.017469
## iter 80 value 11.830264
## iter 90 value 11.708991
## iter 100 value 11.706945
## final value 11.706945
## stopped after 100 iterations
## # weights: 21
## initial value 100.142880
## iter 10 value 45.962487
## iter 20 value 16.054916
## iter 30 value 14.138142
## iter 40 value 13.913566
## iter 50 value 13.732967
## iter 60 value 13.658799
## iter 70 value 13.656558
## iter 80 value 13.656445
## final value 13.656444
## converged
## # weights: 21
## initial value 101.093191
## iter 10 value 41.149834
## iter 20 value 13.646982
## iter 30 value 12.901699
## iter 40 value 12.657001
## iter 50 value 12.294650
## iter 60 value 12.249010
## iter 70 value 12.242734
## iter 80 value 12.242566
## final value 12.242553
## converged
## # weights: 21
## initial value 111.428036
## iter 10 value 72.939356
## iter 20 value 19.995742
## iter 30 value 17.804604
## iter 40 value 16.462203
## iter 50 value 15.810207
## iter 60 value 15.575160
## iter 70 value 15.535525
## iter 80 value 15.532111
## iter 90 value 15.532101
## final value 15.532101
## converged
## # weights: 21
## initial value 106.911638
## iter 10 value 43.866986
## iter 20 value 20.423534
## iter 30 value 18.582141
## iter 40 value 17.658185
## iter 50 value 16.422369
## iter 60 value 15.745003
## iter 70 value 15.161795
## iter 80 value 15.114947
## iter 90 value 15.114449
## final value 15.114418
## converged
## # weights: 21
## initial value 112.426143
## iter 10 value 37.801097
## iter 20 value 18.500269
## iter 30 value 17.785433
## iter 40 value 17.507623
## iter 50 value 17.404310
## iter 60 value 17.396385
## iter 70 value 17.395876
## final value 17.395868
## converged
## [Tune-y] 1: mmce.test.mean=0.0683333; time: 0.0 min
## [Tune-x] 2: size=2; decay=0.0999
## # weights: 21
## initial value 105.511103
## iter 10 value 55.189299
## iter 20 value 50.721806
## iter 30 value 47.445012
## iter 40 value 47.354530
## iter 50 value 47.347136
## final value 47.347133
## converged
## # weights: 21
## initial value 107.010000
## iter 10 value 62.513355
## iter 20 value 46.370487
## iter 30 value 46.323294
## final value 46.322992
## converged
## # weights: 21
## initial value 102.478866
## iter 10 value 51.054032
## iter 20 value 44.793396
## iter 30 value 44.513571
## iter 40 value 44.511986
## final value 44.511968
## converged
## # weights: 21
## initial value 102.768070
## iter 10 value 56.874262
## iter 20 value 47.306034
## iter 30 value 47.212600
## final value 47.212549
## converged
## # weights: 21
## initial value 101.669360
## iter 10 value 52.824498
## iter 20 value 46.249418
## iter 30 value 45.856901
## final value 45.856886
## converged
## # weights: 21
## initial value 99.132985
## iter 10 value 60.799939
## iter 20 value 51.646099
## iter 30 value 50.636378
## iter 40 value 49.985909
## iter 50 value 49.954302
## final value 49.954255
## converged
## # weights: 21
## initial value 100.639040
## iter 10 value 65.272319
## iter 20 value 50.463102
## iter 30 value 49.087443
## iter 40 value 48.995773
## final value 48.995657
## converged
## # weights: 21
## initial value 102.853680
## iter 10 value 58.038007
## iter 20 value 52.862041
## iter 30 value 48.523721
## iter 40 value 47.191848
## iter 50 value 47.182403
## final value 47.182310
## converged
## # weights: 21
## initial value 121.952585
## iter 10 value 83.197399
## iter 20 value 48.893173
## iter 30 value 48.517445
## iter 40 value 48.472165
## iter 50 value 48.464916
## final value 48.464844
## converged
## # weights: 21
## initial value 106.799670
## iter 10 value 73.204499
## iter 20 value 51.755842
## iter 30 value 50.368388
## iter 40 value 50.348957
## iter 40 value 50.348957
## iter 40 value 50.348957
## final value 50.348957
## converged
## # weights: 21
## initial value 100.347623
## iter 10 value 68.629781
## iter 20 value 47.915084
## iter 30 value 47.339885
## iter 40 value 47.328921
## iter 40 value 47.328921
## iter 40 value 47.328921
## final value 47.328921
## converged
## # weights: 21
## initial value 107.063314
## iter 10 value 73.243069
## iter 20 value 46.185791
## iter 30 value 45.424616
## iter 40 value 45.384692
## final value 45.384580
## converged
## # weights: 21
## initial value 101.737198
## iter 10 value 57.706092
## iter 20 value 51.428916
## iter 30 value 48.336822
## iter 40 value 48.225455
## final value 48.225442
## converged
## # weights: 21
## initial value 106.204221
## iter 10 value 61.477756
## iter 20 value 47.907891
## iter 30 value 46.888763
## iter 40 value 46.731646
## final value 46.706424
## converged
## # weights: 21
## initial value 98.844757
## iter 10 value 49.393973
## iter 20 value 45.133908
## iter 30 value 45.068202
## final value 45.068129
## converged
## # weights: 21
## initial value 99.976960
## iter 10 value 79.206562
## iter 20 value 57.510071
## iter 30 value 48.809920
## iter 40 value 46.413309
## iter 50 value 46.321252
## iter 60 value 46.318303
## iter 60 value 46.318303
## iter 60 value 46.318303
## final value 46.318303
## converged
## # weights: 21
## initial value 100.494999
## iter 10 value 60.541286
## iter 20 value 47.151665
## iter 30 value 47.091514
## final value 47.089953
## converged
## # weights: 21
## initial value 101.427179
## iter 10 value 62.515919
## iter 20 value 50.496714
## iter 30 value 47.809597
## iter 40 value 47.771838
## final value 47.771835
## converged
## # weights: 21
## initial value 107.465270
## iter 10 value 60.666569
## iter 20 value 45.507187
## iter 30 value 45.351946
## final value 45.350380
## converged
## # weights: 21
## initial value 103.132640
## iter 10 value 53.045474
## iter 20 value 47.358081
## iter 30 value 47.216070
## final value 47.216015
## converged
## # weights: 21
## initial value 103.156596
## iter 10 value 48.487910
## iter 20 value 44.055743
## iter 30 value 43.900406
## iter 40 value 43.899942
## iter 40 value 43.899941
## iter 40 value 43.899941
## final value 43.899941
## converged
## # weights: 21
## initial value 108.757009
## iter 10 value 63.882752
## iter 20 value 48.699881
## iter 30 value 45.801913
## iter 40 value 45.686879
## final value 45.686587
## converged
## # weights: 21
## initial value 99.077609
## iter 10 value 72.042285
## iter 20 value 52.758616
## iter 30 value 49.163800
## iter 40 value 49.129994
## iter 50 value 49.129611
## final value 49.129470
## converged
## # weights: 21
## initial value 105.026079
## iter 10 value 55.731922
## iter 20 value 48.920381
## iter 30 value 48.777880
## iter 40 value 48.774723
## iter 40 value 48.774722
## iter 40 value 48.774722
## final value 48.774722
## converged
## # weights: 21
## initial value 99.847288
## iter 10 value 69.883903
## iter 20 value 50.663993
## iter 30 value 50.448296
## iter 40 value 50.431366
## final value 50.431344
## converged
## # weights: 21
## initial value 103.520291
## iter 10 value 68.141953
## iter 20 value 51.550142
## iter 30 value 49.951378
## iter 40 value 49.827067
## final value 49.827048
## converged
## # weights: 21
## initial value 100.851900
## iter 10 value 74.062044
## iter 20 value 57.238902
## iter 30 value 52.915850
## iter 40 value 48.916691
## iter 50 value 48.062901
## iter 60 value 47.900354
## final value 47.900271
## converged
## # weights: 21
## initial value 107.947121
## iter 10 value 67.369191
## iter 20 value 50.500349
## iter 30 value 47.034081
## iter 40 value 46.095383
## iter 50 value 45.837554
## iter 60 value 45.832909
## final value 45.832908
## converged
## # weights: 21
## initial value 99.192064
## iter 10 value 56.736631
## iter 20 value 48.817628
## iter 30 value 48.166327
## iter 40 value 48.164702
## iter 40 value 48.164702
## iter 40 value 48.164702
## final value 48.164702
## converged
## # weights: 21
## initial value 99.197385
## iter 10 value 59.613864
## iter 20 value 50.812147
## iter 30 value 50.109840
## final value 50.107701
## converged
## # weights: 21
## initial value 105.216863
## iter 10 value 52.879794
## iter 20 value 48.284889
## iter 30 value 46.919335
## iter 40 value 46.840500
## final value 46.840497
## converged
## # weights: 21
## initial value 102.193454
## iter 10 value 75.281421
## iter 20 value 47.628114
## iter 30 value 45.559107
## iter 40 value 44.868556
## iter 50 value 44.854066
## final value 44.854015
## converged
## # weights: 21
## initial value 104.641849
## iter 10 value 70.109824
## iter 20 value 51.460422
## iter 30 value 51.065798
## iter 40 value 51.044538
## final value 51.044521
## converged
## # weights: 21
## initial value 110.729356
## iter 10 value 80.163077
## iter 20 value 58.542241
## iter 30 value 51.182958
## iter 40 value 47.817081
## iter 50 value 47.016360
## iter 60 value 46.972613
## final value 46.972612
## converged
## # weights: 21
## initial value 107.155933
## iter 10 value 62.650205
## iter 20 value 51.106241
## iter 30 value 46.464632
## iter 40 value 46.215682
## iter 50 value 46.212774
## iter 50 value 46.212773
## iter 50 value 46.212773
## final value 46.212773
## converged
## # weights: 21
## initial value 107.574280
## iter 10 value 75.118994
## iter 20 value 53.866165
## iter 30 value 50.154094
## iter 40 value 49.514868
## iter 50 value 49.450284
## final value 49.450230
## converged
## # weights: 21
## initial value 108.971320
## iter 10 value 67.147973
## iter 20 value 45.904437
## iter 30 value 45.422006
## iter 40 value 45.358473
## final value 45.358461
## converged
## # weights: 21
## initial value 99.846107
## iter 10 value 69.061050
## iter 20 value 51.125843
## iter 30 value 49.483493
## iter 40 value 49.199557
## iter 40 value 49.199557
## iter 40 value 49.199557
## final value 49.199557
## converged
## # weights: 21
## initial value 98.736172
## iter 10 value 64.017420
## iter 20 value 55.999660
## iter 30 value 48.850349
## iter 40 value 48.274861
## iter 50 value 48.247935
## final value 48.247279
## converged
## # weights: 21
## initial value 113.764781
## iter 10 value 81.349232
## iter 20 value 53.780063
## iter 30 value 51.893226
## iter 40 value 51.192481
## iter 50 value 51.124995
## final value 51.122976
## converged
## [Tune-y] 2: mmce.test.mean=0.0558333; time: 0.0 min
## [Tune-x] 3: size=3; decay=0.0503
## # weights: 30
## initial value 104.045824
## iter 10 value 89.216279
## iter 20 value 44.273581
## iter 30 value 35.896994
## iter 40 value 35.721667
## iter 50 value 35.031246
## iter 60 value 34.802736
## iter 70 value 34.697341
## iter 80 value 34.646863
## iter 90 value 34.638313
## iter 100 value 34.637681
## final value 34.637681
## stopped after 100 iterations
## # weights: 30
## initial value 126.091482
## iter 10 value 60.068302
## iter 20 value 37.272028
## iter 30 value 35.752393
## iter 40 value 33.810930
## iter 50 value 33.360978
## iter 60 value 33.276964
## iter 70 value 33.218350
## iter 80 value 33.212723
## final value 33.212553
## converged
## # weights: 30
## initial value 105.728533
## iter 10 value 52.676896
## iter 20 value 32.463078
## iter 30 value 30.861966
## iter 40 value 30.074642
## iter 50 value 29.757499
## iter 60 value 29.717727
## iter 70 value 29.717277
## final value 29.717262
## converged
## # weights: 30
## initial value 111.883618
## iter 10 value 69.245716
## iter 20 value 35.246621
## iter 30 value 33.886549
## iter 40 value 33.646168
## iter 50 value 33.505204
## iter 60 value 33.484457
## iter 70 value 33.479709
## iter 80 value 33.479337
## final value 33.479336
## converged
## # weights: 30
## initial value 101.715178
## iter 10 value 40.551187
## iter 20 value 32.620583
## iter 30 value 32.093706
## iter 40 value 31.721609
## iter 50 value 31.689953
## iter 60 value 31.670001
## final value 31.669951
## converged
## # weights: 30
## initial value 101.711293
## iter 10 value 46.934490
## iter 20 value 35.589842
## iter 30 value 34.844398
## iter 40 value 34.509847
## iter 50 value 34.441780
## iter 60 value 34.369959
## iter 70 value 34.202666
## iter 80 value 33.861411
## iter 90 value 33.844532
## final value 33.844297
## converged
## # weights: 30
## initial value 107.281428
## iter 10 value 51.418402
## iter 20 value 33.522796
## iter 30 value 32.686601
## iter 40 value 32.455238
## iter 50 value 32.442006
## iter 60 value 32.406639
## iter 70 value 32.402306
## iter 80 value 32.402229
## final value 32.402224
## converged
## # weights: 30
## initial value 100.524304
## iter 10 value 55.808354
## iter 20 value 37.102935
## iter 30 value 34.930836
## iter 40 value 33.949684
## iter 50 value 33.666228
## iter 60 value 33.601186
## iter 70 value 33.575276
## iter 80 value 33.571751
## iter 90 value 33.571608
## iter 90 value 33.571608
## iter 90 value 33.571608
## final value 33.571608
## converged
## # weights: 30
## initial value 101.887324
## iter 10 value 54.046076
## iter 20 value 41.282893
## iter 30 value 36.234298
## iter 40 value 35.439521
## iter 50 value 35.309915
## iter 60 value 35.298051
## iter 70 value 35.297355
## iter 80 value 35.297292
## final value 35.297291
## converged
## # weights: 30
## initial value 105.321864
## iter 10 value 56.143846
## iter 20 value 36.491622
## iter 30 value 34.465435
## iter 40 value 34.011924
## iter 50 value 33.985391
## iter 60 value 33.980058
## iter 70 value 33.975577
## iter 80 value 33.973318
## final value 33.973311
## converged
## # weights: 30
## initial value 116.664737
## iter 10 value 92.534745
## iter 20 value 41.198301
## iter 30 value 33.941819
## iter 40 value 32.982966
## iter 50 value 31.831567
## iter 60 value 30.716516
## iter 70 value 30.667231
## iter 80 value 30.633250
## iter 90 value 30.629251
## final value 30.629229
## converged
## # weights: 30
## initial value 104.728010
## iter 10 value 50.357290
## iter 20 value 33.027284
## iter 30 value 32.245762
## iter 40 value 31.543562
## iter 50 value 31.384294
## iter 60 value 31.353650
## iter 70 value 31.352809
## iter 80 value 31.348333
## iter 90 value 31.316799
## iter 100 value 31.285401
## final value 31.285401
## stopped after 100 iterations
## # weights: 30
## initial value 125.927582
## iter 10 value 51.373456
## iter 20 value 36.081778
## iter 30 value 35.536192
## iter 40 value 35.260272
## iter 50 value 35.154762
## iter 60 value 35.120373
## iter 70 value 35.104741
## iter 80 value 35.103340
## final value 35.103310
## converged
## # weights: 30
## initial value 100.920969
## iter 10 value 37.793392
## iter 20 value 33.415857
## iter 30 value 32.828423
## iter 40 value 32.518932
## iter 50 value 32.476236
## iter 60 value 32.470227
## iter 70 value 32.467205
## iter 80 value 32.466698
## final value 32.466689
## converged
## # weights: 30
## initial value 104.120202
## iter 10 value 40.512773
## iter 20 value 33.585053
## iter 30 value 32.113128
## iter 40 value 31.553206
## iter 50 value 31.341912
## iter 60 value 31.338100
## iter 70 value 31.335197
## iter 80 value 31.334994
## final value 31.334982
## converged
## # weights: 30
## initial value 117.702334
## iter 10 value 94.433265
## iter 20 value 39.313621
## iter 30 value 33.420217
## iter 40 value 32.549640
## iter 50 value 32.547177
## iter 60 value 32.546934
## iter 70 value 32.543511
## iter 80 value 32.533457
## iter 90 value 32.507902
## iter 100 value 32.501787
## final value 32.501787
## stopped after 100 iterations
## # weights: 30
## initial value 113.329304
## iter 10 value 47.751532
## iter 20 value 33.355733
## iter 30 value 32.921993
## iter 40 value 32.914024
## iter 50 value 32.911425
## iter 60 value 32.910534
## iter 70 value 32.909937
## iter 80 value 32.909779
## final value 32.909775
## converged
## # weights: 30
## initial value 106.842180
## iter 10 value 64.438227
## iter 20 value 35.574954
## iter 30 value 34.968275
## iter 40 value 34.781675
## iter 50 value 34.746398
## iter 60 value 34.654962
## iter 70 value 34.618494
## iter 80 value 34.610445
## iter 90 value 34.610194
## final value 34.610192
## converged
## # weights: 30
## initial value 98.847924
## iter 10 value 38.560949
## iter 20 value 32.125703
## iter 30 value 31.559143
## iter 40 value 31.278467
## iter 50 value 31.228005
## iter 60 value 31.225191
## iter 70 value 31.221440
## iter 80 value 31.219950
## iter 90 value 31.219900
## iter 90 value 31.219900
## iter 90 value 31.219900
## final value 31.219900
## converged
## # weights: 30
## initial value 106.355014
## iter 10 value 48.069203
## iter 20 value 38.499356
## iter 30 value 36.399479
## iter 40 value 36.085261
## iter 50 value 36.057782
## iter 60 value 35.342082
## iter 70 value 33.734401
## iter 80 value 33.469685
## iter 90 value 33.467871
## iter 100 value 33.467587
## final value 33.467587
## stopped after 100 iterations
## # weights: 30
## initial value 103.526395
## iter 10 value 47.528053
## iter 20 value 32.043913
## iter 30 value 29.642669
## iter 40 value 29.372169
## iter 50 value 29.268549
## iter 60 value 29.251545
## iter 70 value 29.250948
## iter 80 value 29.250644
## final value 29.250642
## converged
## # weights: 30
## initial value 101.429885
## iter 10 value 43.515628
## iter 20 value 34.108701
## iter 30 value 33.388556
## iter 40 value 32.947467
## iter 50 value 32.845407
## iter 60 value 32.841785
## iter 70 value 32.841246
## iter 80 value 32.840794
## final value 32.840779
## converged
## # weights: 30
## initial value 107.159448
## iter 10 value 78.082167
## iter 20 value 41.384080
## iter 30 value 38.313083
## iter 40 value 36.501628
## iter 50 value 35.697330
## iter 60 value 35.373152
## iter 70 value 35.319274
## iter 80 value 35.298602
## iter 90 value 35.297230
## final value 35.297230
## converged
## # weights: 30
## initial value 107.198571
## iter 10 value 53.017985
## iter 20 value 36.788016
## iter 30 value 35.806080
## iter 40 value 34.930265
## iter 50 value 34.656193
## iter 60 value 34.635057
## iter 70 value 34.632329
## iter 80 value 34.631284
## final value 34.631243
## converged
## # weights: 30
## initial value 123.335430
## iter 10 value 46.633187
## iter 20 value 34.793149
## iter 30 value 33.137611
## iter 40 value 32.947083
## iter 50 value 32.916441
## iter 60 value 32.868035
## iter 70 value 32.825336
## iter 80 value 32.822928
## iter 90 value 32.822612
## final value 32.822611
## converged
## # weights: 30
## initial value 98.083825
## iter 10 value 45.359270
## iter 20 value 32.627439
## iter 30 value 32.258574
## iter 40 value 32.206372
## iter 50 value 32.187926
## iter 60 value 32.176890
## iter 70 value 32.175087
## iter 80 value 32.175069
## final value 32.175068
## converged
## # weights: 30
## initial value 102.222835
## iter 10 value 44.955032
## iter 20 value 35.608312
## iter 30 value 34.954348
## iter 40 value 34.753083
## iter 50 value 34.703087
## iter 60 value 34.698048
## iter 70 value 34.697728
## final value 34.697720
## converged
## # weights: 30
## initial value 108.986123
## iter 10 value 47.462559
## iter 20 value 37.484609
## iter 30 value 33.670830
## iter 40 value 32.290331
## iter 50 value 32.060552
## iter 60 value 31.912677
## iter 70 value 31.884550
## iter 80 value 31.878357
## iter 90 value 31.878155
## iter 90 value 31.878155
## iter 90 value 31.878155
## final value 31.878155
## converged
## # weights: 30
## initial value 103.614007
## iter 10 value 49.969698
## iter 20 value 40.235476
## iter 30 value 37.123043
## iter 40 value 36.104189
## iter 50 value 35.498184
## iter 60 value 35.200648
## iter 70 value 35.069614
## iter 80 value 35.052874
## iter 90 value 35.052526
## final value 35.052526
## converged
## # weights: 30
## initial value 100.832064
## iter 10 value 48.094842
## iter 20 value 37.546268
## iter 30 value 35.907724
## iter 40 value 35.554689
## iter 50 value 34.058268
## iter 60 value 33.338087
## iter 70 value 32.857936
## iter 80 value 32.578767
## iter 90 value 32.570203
## iter 100 value 32.569977
## final value 32.569977
## stopped after 100 iterations
## # weights: 30
## initial value 100.431086
## iter 10 value 59.899575
## iter 20 value 35.493799
## iter 30 value 34.917733
## iter 40 value 34.445500
## iter 50 value 34.059238
## iter 60 value 34.034338
## iter 70 value 34.026379
## iter 80 value 34.025591
## final value 34.025582
## converged
## # weights: 30
## initial value 105.063390
## iter 10 value 44.930809
## iter 20 value 31.488770
## iter 30 value 30.539919
## iter 40 value 30.343821
## iter 50 value 30.131551
## iter 60 value 29.910970
## iter 70 value 29.906159
## iter 80 value 29.905036
## final value 29.905019
## converged
## # weights: 30
## initial value 105.174038
## iter 10 value 44.677296
## iter 20 value 37.278535
## iter 30 value 35.851828
## iter 40 value 35.442448
## iter 50 value 35.322088
## iter 60 value 35.319692
## final value 35.319654
## converged
## # weights: 30
## initial value 99.088560
## iter 10 value 51.338614
## iter 20 value 37.919999
## iter 30 value 36.247226
## iter 40 value 36.148094
## iter 50 value 36.145265
## iter 60 value 35.845307
## iter 70 value 34.111926
## iter 80 value 33.033872
## iter 90 value 32.975550
## iter 100 value 32.975128
## final value 32.975128
## stopped after 100 iterations
## # weights: 30
## initial value 127.929440
## iter 10 value 99.269484
## iter 20 value 43.738326
## iter 30 value 32.235920
## iter 40 value 31.299708
## iter 50 value 31.244240
## iter 60 value 31.205527
## iter 70 value 31.092650
## iter 80 value 31.061847
## iter 90 value 31.038937
## iter 100 value 31.038595
## final value 31.038595
## stopped after 100 iterations
## # weights: 30
## initial value 105.391938
## iter 10 value 54.553506
## iter 20 value 35.951629
## iter 30 value 34.036114
## iter 40 value 33.299702
## iter 50 value 32.912639
## iter 60 value 32.713950
## iter 70 value 32.565923
## iter 80 value 32.498843
## final value 32.498162
## converged
## # weights: 30
## initial value 101.916164
## iter 10 value 60.122482
## iter 20 value 36.442201
## iter 30 value 31.737280
## iter 40 value 31.222660
## iter 50 value 31.100141
## iter 60 value 31.061418
## iter 70 value 31.057596
## iter 80 value 31.057403
## final value 31.057392
## converged
## # weights: 30
## initial value 100.019807
## iter 10 value 50.453978
## iter 20 value 34.640083
## iter 30 value 33.193100
## iter 40 value 33.048811
## iter 50 value 32.743180
## iter 60 value 32.681233
## iter 70 value 32.623055
## iter 80 value 32.618569
## final value 32.618560
## converged
## # weights: 30
## initial value 99.828154
## iter 10 value 63.650257
## iter 20 value 40.878525
## iter 30 value 35.708467
## iter 40 value 35.219695
## iter 50 value 35.089435
## iter 60 value 34.620647
## iter 70 value 34.540113
## iter 80 value 34.375723
## iter 90 value 33.609108
## iter 100 value 33.496871
## final value 33.496871
## stopped after 100 iterations
## # weights: 30
## initial value 103.953415
## iter 10 value 56.479620
## iter 20 value 36.404885
## iter 30 value 35.230850
## iter 40 value 34.779425
## iter 50 value 34.464815
## iter 60 value 34.348600
## iter 70 value 34.283374
## iter 80 value 34.249919
## iter 90 value 34.249128
## final value 34.249072
## converged
## [Tune-y] 3: mmce.test.mean=0.0516667; time: 0.0 min
## [Tune-x] 4: size=2; decay=0.0513
## # weights: 21
## initial value 104.000438
## iter 10 value 50.848541
## iter 20 value 35.847336
## iter 30 value 35.675386
## iter 40 value 35.674732
## final value 35.674730
## converged
## # weights: 21
## initial value 102.427716
## iter 10 value 51.129488
## iter 20 value 37.992838
## iter 30 value 36.487567
## iter 40 value 36.199705
## iter 50 value 36.103085
## final value 36.103024
## converged
## # weights: 21
## initial value 110.398442
## iter 10 value 84.087825
## iter 20 value 40.568558
## iter 30 value 38.177886
## iter 40 value 37.146954
## iter 50 value 35.850791
## iter 60 value 33.081139
## iter 70 value 32.530496
## final value 32.529854
## converged
## # weights: 21
## initial value 106.485235
## iter 10 value 67.737095
## iter 20 value 40.891643
## iter 30 value 37.847434
## iter 40 value 34.794265
## iter 50 value 34.627184
## iter 60 value 34.610238
## final value 34.610229
## converged
## # weights: 21
## initial value 105.845169
## iter 10 value 45.251163
## iter 20 value 39.481000
## iter 30 value 35.712567
## iter 40 value 32.918069
## iter 50 value 32.892295
## final value 32.889845
## converged
## # weights: 21
## initial value 99.141160
## iter 10 value 39.268347
## iter 20 value 35.913979
## iter 30 value 35.454733
## iter 40 value 35.431055
## final value 35.431005
## converged
## # weights: 21
## initial value 99.676919
## iter 10 value 74.565518
## iter 20 value 34.779652
## iter 30 value 34.197177
## iter 40 value 34.086386
## iter 50 value 34.081858
## iter 60 value 34.081345
## final value 34.081344
## converged
## # weights: 21
## initial value 107.455705
## iter 10 value 69.742155
## iter 20 value 41.492737
## iter 30 value 38.482574
## iter 40 value 36.680038
## iter 50 value 36.376454
## iter 60 value 36.268440
## final value 36.268319
## converged
## # weights: 21
## initial value 98.004870
## iter 10 value 49.836161
## iter 20 value 39.945465
## iter 30 value 37.056270
## iter 40 value 36.922362
## iter 50 value 36.919747
## iter 60 value 36.917506
## final value 36.917494
## converged
## # weights: 21
## initial value 106.318876
## iter 10 value 57.251366
## iter 20 value 40.948523
## iter 30 value 39.974190
## iter 40 value 35.408705
## iter 50 value 35.013856
## iter 60 value 35.012875
## final value 35.012873
## converged
## # weights: 21
## initial value 108.992069
## iter 10 value 47.567940
## iter 20 value 31.879208
## iter 30 value 31.769020
## iter 40 value 31.767327
## final value 31.767321
## converged
## # weights: 21
## initial value 99.504503
## iter 10 value 58.945320
## iter 20 value 41.699676
## iter 30 value 37.345502
## iter 40 value 32.663329
## iter 50 value 32.427457
## iter 60 value 32.413274
## final value 32.413271
## converged
## # weights: 21
## initial value 133.711085
## iter 10 value 96.774005
## iter 20 value 43.336549
## iter 30 value 37.116052
## iter 40 value 36.258367
## iter 50 value 36.169719
## iter 60 value 36.161935
## final value 36.161931
## converged
## # weights: 21
## initial value 104.406434
## iter 10 value 45.527309
## iter 20 value 34.909793
## iter 30 value 33.657829
## iter 40 value 33.638803
## iter 50 value 33.637445
## final value 33.637430
## converged
## # weights: 21
## initial value 113.916625
## iter 10 value 67.468432
## iter 20 value 39.347111
## iter 30 value 35.024708
## iter 40 value 34.264106
## iter 50 value 34.236507
## iter 60 value 34.236000
## final value 34.235999
## converged
## # weights: 21
## initial value 113.183681
## iter 10 value 85.287970
## iter 20 value 37.415159
## iter 30 value 35.687115
## iter 40 value 35.530371
## final value 35.519348
## converged
## # weights: 21
## initial value 104.878362
## iter 10 value 54.916582
## iter 20 value 37.883668
## iter 30 value 34.136200
## iter 40 value 34.002161
## iter 50 value 33.995257
## iter 60 value 33.994627
## iter 60 value 33.994627
## iter 60 value 33.994627
## final value 33.994627
## converged
## # weights: 21
## initial value 100.741369
## iter 10 value 54.725599
## iter 20 value 40.048571
## iter 30 value 38.619867
## iter 40 value 37.732935
## iter 50 value 37.722151
## final value 37.721966
## converged
## # weights: 21
## initial value 116.031509
## iter 10 value 79.438230
## iter 20 value 35.899978
## iter 30 value 32.475271
## iter 40 value 32.318264
## iter 50 value 32.314514
## final value 32.314472
## converged
## # weights: 21
## initial value 108.425013
## iter 10 value 51.806634
## iter 20 value 41.035120
## iter 30 value 37.666759
## iter 40 value 36.376348
## iter 50 value 36.279443
## final value 36.274760
## converged
## # weights: 21
## initial value 99.280351
## iter 10 value 48.950232
## iter 20 value 31.767571
## iter 30 value 31.210820
## iter 40 value 31.075545
## final value 31.064648
## converged
## # weights: 21
## initial value 107.528995
## iter 10 value 64.753875
## iter 20 value 37.109607
## iter 30 value 35.728821
## iter 40 value 35.493879
## iter 50 value 35.492044
## iter 50 value 35.492044
## iter 50 value 35.492044
## final value 35.492044
## converged
## # weights: 21
## initial value 104.754947
## iter 10 value 72.461777
## iter 20 value 45.709853
## iter 30 value 42.949253
## iter 40 value 41.950213
## iter 50 value 40.464059
## iter 60 value 38.337931
## iter 70 value 38.275502
## final value 38.275458
## converged
## # weights: 21
## initial value 120.211556
## iter 10 value 95.288635
## iter 20 value 43.689952
## iter 30 value 39.422112
## iter 40 value 37.755306
## iter 50 value 37.648721
## final value 37.645034
## converged
## # weights: 21
## initial value 100.439192
## iter 10 value 58.283798
## iter 20 value 37.431866
## iter 30 value 34.488304
## iter 40 value 33.918433
## iter 50 value 33.902618
## final value 33.901993
## converged
## # weights: 21
## initial value 111.132865
## iter 10 value 55.780963
## iter 20 value 41.122737
## iter 30 value 40.170213
## iter 40 value 39.278845
## iter 50 value 36.369999
## iter 60 value 35.123025
## iter 70 value 35.116841
## iter 70 value 35.116841
## iter 70 value 35.116841
## final value 35.116841
## converged
## # weights: 21
## initial value 109.882301
## iter 10 value 70.028900
## iter 20 value 41.274490
## iter 30 value 39.061499
## iter 40 value 35.941985
## iter 50 value 35.684725
## iter 60 value 35.652783
## final value 35.652767
## converged
## # weights: 21
## initial value 105.096548
## iter 10 value 49.140096
## iter 20 value 40.950410
## iter 30 value 38.534513
## iter 40 value 33.971587
## iter 50 value 33.452968
## iter 60 value 33.420300
## final value 33.420291
## converged
## # weights: 21
## initial value 104.827521
## iter 10 value 57.021199
## iter 20 value 41.189796
## iter 30 value 37.037350
## iter 40 value 36.656816
## iter 50 value 36.652941
## iter 60 value 36.651758
## final value 36.651757
## converged
## # weights: 21
## initial value 101.850866
## iter 10 value 48.318970
## iter 20 value 34.606937
## iter 30 value 34.339643
## iter 40 value 34.293885
## final value 34.291722
## converged
## # weights: 21
## initial value 101.014266
## iter 10 value 70.027269
## iter 20 value 37.260300
## iter 30 value 35.438442
## iter 40 value 35.021180
## iter 50 value 34.996677
## final value 34.996503
## converged
## # weights: 21
## initial value 124.930926
## iter 10 value 79.049002
## iter 20 value 39.112790
## iter 30 value 34.498642
## iter 40 value 32.560374
## iter 50 value 31.263198
## iter 60 value 31.156059
## final value 31.155771
## converged
## # weights: 21
## initial value 105.716977
## iter 10 value 60.749590
## iter 20 value 38.926233
## iter 30 value 36.870818
## iter 40 value 36.850763
## final value 36.848043
## converged
## # weights: 21
## initial value 107.268384
## iter 10 value 50.252415
## iter 20 value 40.022012
## iter 30 value 38.285801
## iter 40 value 34.751612
## iter 50 value 33.972760
## iter 60 value 33.959306
## final value 33.959304
## converged
## # weights: 21
## initial value 114.798726
## iter 10 value 67.076961
## iter 20 value 35.930860
## iter 30 value 33.999384
## iter 40 value 33.989637
## iter 50 value 33.988524
## iter 50 value 33.988524
## iter 50 value 33.988524
## final value 33.988524
## converged
## # weights: 21
## initial value 106.187701
## iter 10 value 48.466384
## iter 20 value 36.573346
## iter 30 value 35.648027
## iter 40 value 35.423278
## iter 50 value 35.352993
## final value 35.352877
## converged
## # weights: 21
## initial value 106.701211
## iter 10 value 54.243829
## iter 20 value 32.912682
## iter 30 value 32.731157
## iter 40 value 32.719074
## iter 50 value 32.716549
## iter 60 value 32.716096
## final value 32.716095
## converged
## # weights: 21
## initial value 99.549668
## iter 10 value 59.839953
## iter 20 value 41.047109
## iter 30 value 37.097300
## iter 40 value 36.107616
## iter 50 value 35.354914
## iter 60 value 35.320328
## final value 35.320320
## converged
## # weights: 21
## initial value 100.264139
## iter 10 value 61.389332
## iter 20 value 35.161165
## iter 30 value 34.662839
## iter 40 value 34.649663
## iter 50 value 34.649239
## final value 34.649221
## converged
## # weights: 21
## initial value 99.789374
## iter 10 value 52.307130
## iter 20 value 36.725684
## iter 30 value 36.150637
## iter 40 value 36.009400
## iter 50 value 36.009003
## iter 60 value 36.008820
## iter 60 value 36.008820
## iter 60 value 36.008820
## final value 36.008820
## converged
## [Tune-y] 4: mmce.test.mean=0.0508333; time: 0.0 min
## [Tune-x] 5: size=2; decay=0.0529
## # weights: 21
## initial value 100.862360
## iter 10 value 64.953351
## iter 20 value 40.551040
## iter 30 value 36.892632
## iter 40 value 36.153009
## iter 50 value 36.144807
## final value 36.144654
## converged
## # weights: 21
## initial value 107.254459
## iter 10 value 54.480740
## iter 20 value 36.483006
## iter 30 value 34.950953
## iter 40 value 34.751849
## iter 50 value 34.735041
## final value 34.735029
## converged
## # weights: 21
## initial value 102.941427
## iter 10 value 50.277769
## iter 20 value 35.897986
## iter 30 value 33.737163
## iter 40 value 33.150503
## final value 33.145993
## converged
## # weights: 21
## initial value 101.975211
## iter 10 value 52.908119
## iter 20 value 37.444683
## iter 30 value 37.257785
## iter 40 value 37.251314
## final value 37.251299
## converged
## # weights: 21
## initial value 99.676401
## iter 10 value 53.383486
## iter 20 value 35.081547
## iter 30 value 33.981463
## iter 40 value 33.957842
## final value 33.957835
## converged
## # weights: 21
## initial value 105.408463
## iter 10 value 46.035285
## iter 20 value 36.251377
## iter 30 value 35.946304
## iter 40 value 35.925556
## iter 50 value 35.919126
## final value 35.919109
## converged
## # weights: 21
## initial value 100.723575
## iter 10 value 45.672297
## iter 20 value 36.585837
## iter 30 value 34.920237
## iter 40 value 34.598787
## iter 50 value 34.587848
## iter 60 value 34.586061
## final value 34.586058
## converged
## # weights: 21
## initial value 114.638812
## iter 10 value 64.960517
## iter 20 value 35.122117
## iter 30 value 35.103092
## iter 40 value 35.101742
## final value 35.101726
## converged
## # weights: 21
## initial value 116.342383
## iter 10 value 57.417554
## iter 20 value 38.952047
## iter 30 value 38.687793
## iter 40 value 38.579466
## iter 50 value 38.571170
## iter 60 value 38.570168
## iter 60 value 38.570167
## iter 60 value 38.570167
## final value 38.570167
## converged
## # weights: 21
## initial value 102.349882
## iter 10 value 66.044261
## iter 20 value 36.880034
## iter 30 value 35.507174
## iter 40 value 35.501978
## final value 35.501486
## converged
## # weights: 21
## initial value 98.534505
## iter 10 value 45.374384
## iter 20 value 35.275671
## iter 30 value 34.139452
## iter 40 value 34.058438
## final value 34.057949
## converged
## # weights: 21
## initial value 107.623702
## iter 10 value 78.877374
## iter 20 value 36.289263
## iter 30 value 35.408341
## iter 40 value 35.194876
## final value 35.193530
## converged
## # weights: 21
## initial value 101.465299
## iter 10 value 55.859048
## iter 20 value 37.936314
## iter 30 value 37.321700
## iter 40 value 37.297750
## iter 50 value 37.297643
## final value 37.297635
## converged
## # weights: 21
## initial value 115.058681
## iter 10 value 44.729700
## iter 20 value 34.898371
## iter 30 value 34.651282
## iter 40 value 34.646305
## iter 50 value 34.645306
## final value 34.645279
## converged
## # weights: 21
## initial value 107.826618
## iter 10 value 69.238554
## iter 20 value 39.963655
## iter 30 value 35.406281
## iter 40 value 34.838021
## iter 50 value 34.818469
## iter 50 value 34.818469
## iter 50 value 34.818469
## final value 34.818469
## converged
## # weights: 21
## initial value 107.257231
## iter 10 value 52.021501
## iter 20 value 38.593808
## iter 30 value 35.421529
## iter 40 value 34.787835
## iter 50 value 34.759014
## iter 60 value 34.754060
## final value 34.754050
## converged
## # weights: 21
## initial value 99.162026
## iter 10 value 47.153686
## iter 20 value 38.816275
## iter 30 value 36.386405
## iter 40 value 36.258750
## iter 50 value 36.251158
## final value 36.251148
## converged
## # weights: 21
## initial value 112.633330
## iter 10 value 58.062931
## iter 20 value 37.376517
## iter 30 value 36.152688
## iter 40 value 36.119992
## iter 50 value 36.119841
## final value 36.119834
## converged
## # weights: 21
## initial value 113.657742
## iter 10 value 76.228487
## iter 20 value 39.912643
## iter 30 value 39.281470
## iter 40 value 34.345377
## iter 50 value 33.569635
## iter 60 value 33.478392
## iter 70 value 32.895059
## iter 80 value 32.843756
## final value 32.843745
## converged
## # weights: 21
## initial value 102.334106
## iter 10 value 46.207843
## iter 20 value 35.111449
## iter 30 value 35.057032
## final value 35.056020
## converged
## # weights: 21
## initial value 104.307902
## iter 10 value 69.726561
## iter 20 value 38.035255
## iter 30 value 34.267165
## iter 40 value 33.692295
## iter 50 value 33.015780
## iter 60 value 33.013640
## iter 60 value 33.013640
## iter 60 value 33.013640
## final value 33.013640
## converged
## # weights: 21
## initial value 102.707303
## iter 10 value 52.377290
## iter 20 value 41.498343
## iter 30 value 35.471504
## iter 40 value 35.031880
## iter 50 value 34.920855
## iter 60 value 34.918037
## final value 34.918033
## converged
## # weights: 21
## initial value 105.851044
## iter 10 value 51.025746
## iter 20 value 38.601540
## iter 30 value 37.464577
## iter 40 value 37.453495
## iter 50 value 37.452789
## final value 37.452778
## converged
## # weights: 21
## initial value 106.341031
## iter 10 value 58.200644
## iter 20 value 39.784087
## iter 30 value 36.423010
## iter 40 value 36.166243
## iter 50 value 36.165220
## final value 36.165148
## converged
## # weights: 21
## initial value 116.080121
## iter 10 value 57.818731
## iter 20 value 38.156617
## iter 30 value 35.281346
## iter 40 value 34.486879
## iter 50 value 34.415312
## final value 34.415217
## converged
## # weights: 21
## initial value 100.572486
## iter 10 value 45.135235
## iter 20 value 37.983198
## iter 30 value 35.909803
## iter 40 value 35.813287
## final value 35.812541
## converged
## # weights: 21
## initial value 107.862879
## iter 10 value 49.717950
## iter 20 value 36.322270
## iter 30 value 36.181754
## iter 40 value 36.132250
## iter 50 value 36.119890
## final value 36.119836
## converged
## # weights: 21
## initial value 123.082826
## iter 10 value 88.755395
## iter 20 value 42.672471
## iter 30 value 35.934157
## iter 40 value 35.230242
## iter 50 value 35.217589
## final value 35.217491
## converged
## # weights: 21
## initial value 104.958830
## iter 10 value 52.028173
## iter 20 value 38.261698
## iter 30 value 36.614455
## iter 40 value 36.477565
## iter 50 value 36.456786
## final value 36.455522
## converged
## # weights: 21
## initial value 111.471590
## iter 10 value 98.926656
## iter 20 value 60.036727
## iter 30 value 35.036965
## iter 40 value 34.515407
## iter 50 value 34.316781
## iter 60 value 34.152943
## iter 70 value 34.150187
## iter 70 value 34.150187
## iter 70 value 34.150187
## final value 34.150187
## converged
## # weights: 21
## initial value 99.773018
## iter 10 value 43.557862
## iter 20 value 35.738912
## iter 30 value 35.470482
## iter 40 value 35.467679
## final value 35.467637
## converged
## # weights: 21
## initial value 108.406435
## iter 10 value 61.484961
## iter 20 value 32.294228
## iter 30 value 31.878804
## iter 40 value 31.750423
## iter 50 value 31.689032
## final value 31.686307
## converged
## # weights: 21
## initial value 114.424687
## iter 10 value 56.611537
## iter 20 value 43.141412
## iter 30 value 38.861488
## iter 40 value 38.634576
## iter 50 value 38.603129
## final value 38.602314
## converged
## # weights: 21
## initial value 98.654160
## iter 10 value 59.637955
## iter 20 value 40.033756
## iter 30 value 36.504474
## iter 40 value 34.678074
## iter 50 value 34.446493
## final value 34.445673
## converged
## # weights: 21
## initial value 109.764571
## iter 10 value 68.760242
## iter 20 value 38.538981
## iter 30 value 37.273895
## iter 40 value 35.102427
## iter 50 value 34.876616
## iter 60 value 34.876112
## final value 34.876109
## converged
## # weights: 21
## initial value 107.165247
## iter 10 value 57.371296
## iter 20 value 40.494194
## iter 30 value 36.432587
## iter 40 value 34.139931
## iter 50 value 34.105835
## iter 60 value 34.105474
## iter 60 value 34.105474
## iter 60 value 34.105474
## final value 34.105474
## converged
## # weights: 21
## initial value 109.335505
## iter 10 value 54.306988
## iter 20 value 34.280038
## iter 30 value 33.701822
## iter 40 value 32.832083
## iter 50 value 32.704948
## iter 60 value 32.670340
## final value 32.670324
## converged
## # weights: 21
## initial value 103.641654
## iter 10 value 43.241178
## iter 20 value 35.571223
## iter 30 value 34.123677
## iter 40 value 34.069779
## iter 50 value 34.068608
## final value 34.068548
## converged
## # weights: 21
## initial value 100.183390
## iter 10 value 53.120164
## iter 20 value 36.112688
## iter 30 value 35.753127
## iter 40 value 35.752693
## final value 35.752675
## converged
## # weights: 21
## initial value 100.894318
## iter 10 value 62.856389
## iter 20 value 45.310833
## iter 30 value 40.393090
## iter 40 value 37.928037
## iter 50 value 37.837428
## final value 37.836804
## converged
## [Tune-y] 5: mmce.test.mean=0.0483333; time: 0.0 min
## [Tune-x] 6: size=2; decay=0.0489
## # weights: 21
## initial value 117.861190
## iter 10 value 90.250219
## iter 20 value 40.742923
## iter 30 value 39.022962
## iter 40 value 35.361635
## iter 50 value 34.996072
## iter 60 value 34.937848
## iter 70 value 34.935984
## iter 70 value 34.935984
## iter 70 value 34.935984
## final value 34.935984
## converged
## # weights: 21
## initial value 106.029194
## iter 10 value 48.466427
## iter 20 value 39.708246
## iter 30 value 38.490421
## iter 40 value 36.670063
## iter 50 value 35.307432
## iter 60 value 35.285534
## final value 35.285531
## converged
## # weights: 21
## initial value 106.849317
## iter 10 value 43.415825
## iter 20 value 32.869017
## iter 30 value 31.624670
## iter 40 value 31.561002
## final value 31.560960
## converged
## # weights: 21
## initial value 105.587009
## iter 10 value 51.691826
## iter 20 value 36.692726
## iter 30 value 35.524574
## iter 40 value 35.465896
## iter 50 value 35.465345
## final value 35.465338
## converged
## # weights: 21
## initial value 120.143330
## iter 10 value 72.997709
## iter 20 value 37.570891
## iter 30 value 35.107413
## iter 40 value 33.545031
## iter 50 value 32.163737
## iter 60 value 32.059405
## final value 32.059394
## converged
## # weights: 21
## initial value 100.603201
## iter 10 value 41.441109
## iter 20 value 35.366440
## iter 30 value 34.112787
## iter 40 value 34.095898
## iter 50 value 34.093070
## final value 34.093068
## converged
## # weights: 21
## initial value 106.297463
## iter 10 value 53.175482
## iter 20 value 36.927581
## iter 30 value 35.347907
## iter 40 value 35.036263
## final value 34.963294
## converged
## # weights: 21
## initial value 105.619666
## iter 10 value 58.745873
## iter 20 value 39.208407
## iter 30 value 36.240766
## iter 40 value 35.738948
## iter 50 value 35.725860
## iter 60 value 35.724692
## iter 60 value 35.724691
## iter 60 value 35.724691
## final value 35.724691
## converged
## # weights: 21
## initial value 105.583887
## iter 10 value 65.491055
## iter 20 value 36.513906
## iter 30 value 36.157907
## iter 40 value 36.135490
## iter 50 value 36.131031
## iter 60 value 36.130601
## final value 36.130597
## converged
## # weights: 21
## initial value 113.978501
## iter 10 value 52.173477
## iter 20 value 41.085604
## iter 30 value 36.491546
## iter 40 value 36.011331
## iter 50 value 36.004845
## final value 36.004586
## converged
## # weights: 21
## initial value 105.133454
## iter 10 value 60.851301
## iter 20 value 31.329019
## iter 30 value 31.009777
## iter 40 value 30.984210
## final value 30.982406
## converged
## # weights: 21
## initial value 106.497975
## iter 10 value 60.249416
## iter 20 value 32.296689
## iter 30 value 31.747834
## iter 40 value 31.610813
## iter 50 value 31.593404
## final value 31.593381
## converged
## # weights: 21
## initial value 101.208259
## iter 10 value 67.447495
## iter 20 value 37.185321
## iter 30 value 35.425818
## iter 40 value 35.404260
## final value 35.403925
## converged
## # weights: 21
## initial value 100.451591
## iter 10 value 48.968317
## iter 20 value 36.763274
## iter 30 value 34.622080
## iter 40 value 34.349873
## iter 50 value 34.309222
## final value 34.309144
## converged
## # weights: 21
## initial value 100.650602
## iter 10 value 52.740871
## iter 20 value 42.093963
## iter 30 value 32.946364
## iter 40 value 32.285787
## iter 50 value 32.155022
## iter 60 value 32.151185
## final value 32.151162
## converged
## # weights: 21
## initial value 98.768157
## iter 10 value 48.385739
## iter 20 value 33.368215
## iter 30 value 32.798120
## iter 40 value 32.756773
## final value 32.756531
## converged
## # weights: 21
## initial value 101.478353
## iter 10 value 52.312703
## iter 20 value 38.295034
## iter 30 value 34.842349
## iter 40 value 33.820825
## iter 50 value 33.782484
## final value 33.782128
## converged
## # weights: 21
## initial value 111.550391
## iter 10 value 65.220065
## iter 20 value 42.870568
## iter 30 value 37.915712
## iter 40 value 36.829917
## iter 50 value 36.627020
## iter 60 value 36.608771
## final value 36.608770
## converged
## # weights: 21
## initial value 107.525522
## iter 10 value 46.562441
## iter 20 value 33.120633
## iter 30 value 32.376418
## iter 40 value 32.131337
## iter 50 value 32.125770
## iter 60 value 32.125726
## final value 32.125725
## converged
## # weights: 21
## initial value 102.744198
## iter 10 value 66.948651
## iter 20 value 40.331163
## iter 30 value 38.410295
## iter 40 value 36.310764
## iter 50 value 35.528870
## iter 60 value 35.431026
## final value 35.430971
## converged
## # weights: 21
## initial value 101.111982
## iter 10 value 56.462157
## iter 20 value 35.550679
## iter 30 value 31.062562
## iter 40 value 29.885778
## iter 50 value 29.628286
## iter 60 value 29.624819
## iter 60 value 29.624818
## iter 60 value 29.624818
## final value 29.624818
## converged
## # weights: 21
## initial value 100.897310
## iter 10 value 48.549823
## iter 20 value 33.375943
## iter 30 value 33.222116
## iter 40 value 33.220556
## final value 33.220540
## converged
## # weights: 21
## initial value 100.757238
## iter 10 value 75.770837
## iter 20 value 38.507960
## iter 30 value 37.909705
## iter 40 value 37.734341
## final value 37.732562
## converged
## # weights: 21
## initial value 105.558485
## iter 10 value 71.933908
## iter 20 value 40.920310
## iter 30 value 40.108249
## iter 40 value 37.587752
## iter 50 value 35.568456
## iter 60 value 34.891830
## final value 34.891419
## converged
## # weights: 21
## initial value 106.674563
## iter 10 value 42.910153
## iter 20 value 34.317607
## iter 30 value 33.809851
## iter 40 value 33.731234
## iter 50 value 33.716329
## final value 33.716327
## converged
## # weights: 21
## initial value 104.522037
## iter 10 value 51.747258
## iter 20 value 39.299911
## iter 30 value 35.774372
## iter 40 value 34.593344
## iter 50 value 34.231249
## iter 60 value 34.188912
## final value 34.188907
## converged
## # weights: 21
## initial value 118.934604
## iter 10 value 81.500941
## iter 20 value 35.410736
## iter 30 value 35.066602
## iter 40 value 34.938320
## iter 50 value 34.919249
## iter 60 value 34.918731
## iter 60 value 34.918731
## iter 60 value 34.918731
## final value 34.918731
## converged
## # weights: 21
## initial value 102.416168
## iter 10 value 56.403492
## iter 20 value 35.603010
## iter 30 value 32.772077
## iter 40 value 32.624629
## final value 32.623873
## converged
## # weights: 21
## initial value 100.447528
## iter 10 value 46.796222
## iter 20 value 39.865526
## iter 30 value 37.788226
## iter 40 value 37.473390
## iter 50 value 37.428652
## final value 37.427723
## converged
## # weights: 21
## initial value 101.723900
## iter 10 value 38.734327
## iter 20 value 33.814560
## iter 30 value 33.638394
## iter 40 value 33.516836
## iter 50 value 33.456297
## iter 60 value 33.455376
## final value 33.455359
## converged
## # weights: 21
## initial value 99.974782
## iter 10 value 57.879650
## iter 20 value 38.740902
## iter 30 value 36.587949
## iter 40 value 36.042494
## iter 50 value 36.033150
## final value 36.033044
## converged
## # weights: 21
## initial value 103.013515
## iter 10 value 59.086022
## iter 20 value 36.096015
## iter 30 value 31.469181
## iter 40 value 30.486229
## iter 50 value 30.320742
## final value 30.320578
## converged
## # weights: 21
## initial value 103.743890
## iter 10 value 47.326807
## iter 20 value 40.628897
## iter 30 value 38.043742
## iter 40 value 37.689187
## iter 50 value 37.564948
## iter 60 value 37.556020
## final value 37.556016
## converged
## # weights: 21
## initial value 101.430802
## iter 10 value 54.959616
## iter 20 value 39.914886
## iter 30 value 37.570304
## iter 40 value 35.858085
## iter 50 value 35.313224
## iter 60 value 35.236166
## final value 35.236147
## converged
## # weights: 21
## initial value 107.061360
## iter 10 value 81.303849
## iter 20 value 34.618984
## iter 30 value 31.533948
## iter 40 value 31.404199
## iter 50 value 31.401084
## final value 31.400995
## converged
## # weights: 21
## initial value 110.955782
## iter 10 value 71.959721
## iter 20 value 33.737680
## iter 30 value 32.875519
## iter 40 value 32.804143
## iter 50 value 32.793238
## iter 50 value 32.793238
## final value 32.793238
## converged
## # weights: 21
## initial value 103.713361
## iter 10 value 58.606061
## iter 20 value 37.595633
## iter 30 value 32.990052
## iter 40 value 31.978047
## iter 50 value 31.890762
## iter 60 value 31.888101
## final value 31.888085
## converged
## # weights: 21
## initial value 102.503762
## iter 10 value 56.618728
## iter 20 value 38.240872
## iter 30 value 33.845350
## iter 40 value 33.110492
## iter 50 value 32.848715
## iter 60 value 32.835273
## iter 60 value 32.835273
## iter 60 value 32.835273
## final value 32.835273
## converged
## # weights: 21
## initial value 103.516994
## iter 10 value 64.022834
## iter 20 value 43.091371
## iter 30 value 40.513197
## iter 40 value 34.595830
## iter 50 value 33.872557
## iter 60 value 33.845544
## final value 33.845540
## converged
## # weights: 21
## initial value 119.125959
## iter 10 value 72.927327
## iter 20 value 38.987999
## iter 30 value 36.871837
## iter 40 value 36.789092
## iter 50 value 36.778489
## final value 36.777773
## converged
## [Tune-y] 6: mmce.test.mean=0.0516667; time: 0.0 min
## [Tune] Result: size=2; decay=0.0529 : mmce.test.mean=0.0483333
# Stop the timer (started earlier with tictoc::tic()) and report the
# wall-clock time taken by the preceding nnet grid-search tuning run.
tictoc::toc()
## 4.21 sec elapsed
# Classification task: predict student knowledge level "UNS".
# NOTE(review): knowledge_train_data is built earlier in the document — confirm
# it is still in scope when this chunk runs.
task <- makeClassifTask(data = knowledge_train_data, target = "UNS")
## Warning in makeTask(type = type, data = data, weights = weights, blocking
## = blocking, : Provided data is not a pure data.frame but from class tbl_df,
## hence it will be converted.
# Decision-tree learner; fix.factors.prediction = TRUE pads factor levels that
# are missing at prediction time instead of erroring.
lrn <- makeLearner(cl = "classif.rpart", fix.factors.prediction = TRUE)
# Search space over three rpart complexity controls: minimum observations to
# attempt a split, minimum observations per terminal node, and tree depth.
param_set <- makeParamSet(makeIntegerParam("minsplit", lower = 1, upper = 30),
makeIntegerParam("minbucket", lower = 1, upper = 30),
makeIntegerParam("maxdepth", lower = 3, upper = 10)
)
# Random search: sample 10 hyperparameter combinations from param_set.
ctrl_random <- makeTuneControlRandom(maxit = 10)
# Create holdout sampling
holdout <- makeResampleDesc("Holdout")
# Perform tuning
# Evaluates each sampled combination on the holdout split; the default
# measure for a classification task is mmce (mean misclassification error).
lrn_tune <- tuneParams(learner = lrn, task = task, resampling = holdout,
control = ctrl_random, par.set = param_set
)
## [Tune] Started tuning learner classif.rpart for parameter set:
## Type len Def Constr Req Tunable Trafo
## minsplit integer - - 1 to 30 - TRUE -
## minbucket integer - - 1 to 30 - TRUE -
## maxdepth integer - - 3 to 10 - TRUE -
## With control class: TuneControlRandom
## Imputation value: 1
## [Tune-x] 1: minsplit=5; minbucket=23; maxdepth=7
## [Tune-y] 1: mmce.test.mean=0.1000000; time: 0.0 min
## [Tune-x] 2: minsplit=20; minbucket=29; maxdepth=6
## [Tune-y] 2: mmce.test.mean=0.4500000; time: 0.0 min
## [Tune-x] 3: minsplit=6; minbucket=29; maxdepth=9
## [Tune-y] 3: mmce.test.mean=0.4500000; time: 0.0 min
## [Tune-x] 4: minsplit=29; minbucket=17; maxdepth=7
## [Tune-y] 4: mmce.test.mean=0.1000000; time: 0.0 min
## [Tune-x] 5: minsplit=20; minbucket=1; maxdepth=8
## [Tune-y] 5: mmce.test.mean=0.0750000; time: 0.0 min
## [Tune-x] 6: minsplit=15; minbucket=16; maxdepth=10
## [Tune-y] 6: mmce.test.mean=0.1000000; time: 0.0 min
## [Tune-x] 7: minsplit=11; minbucket=5; maxdepth=3
## [Tune-y] 7: mmce.test.mean=0.0750000; time: 0.0 min
## [Tune-x] 8: minsplit=10; minbucket=9; maxdepth=8
## [Tune-y] 8: mmce.test.mean=0.1000000; time: 0.0 min
## [Tune-x] 9: minsplit=21; minbucket=11; maxdepth=7
## [Tune-y] 9: mmce.test.mean=0.1000000; time: 0.0 min
## [Tune-x] 10: minsplit=19; minbucket=14; maxdepth=9
## [Tune-y] 10: mmce.test.mean=0.1000000; time: 0.0 min
## [Tune] Result: minsplit=11; minbucket=5; maxdepth=3 : mmce.test.mean=0.0750000
# Generate hyperparameter effect data
# partial.dep = TRUE is required because more than two hyperparameters were
# tuned; effects must be shown via partial dependence.
hyperpar_effects <- generateHyperParsEffectData(lrn_tune, partial.dep = TRUE)
# Plot hyperparameter effects
# partial.dep.learn names a regression surrogate ("regr.glm") used to
# marginalize over the hyperparameters not mapped to an aesthetic.
plotHyperParsEffect(hyperpar_effects, partial.dep.learn = "regr.glm",
x = "minsplit", y = "mmce.test.mean", z = "maxdepth", plot.type = "line"
)
## Loading required package: mmpf
# Re-create the task, then tune a single-hidden-layer neural net
# (nnet) instead of rpart, this time also tracking training-set measures.
task <- makeClassifTask(data = knowledge_train_data, target = "UNS")
## Warning in makeTask(type = type, data = data, weights = weights, blocking
## = blocking, : Provided data is not a pure data.frame but from class tbl_df,
## hence it will be converted.
lrn <- makeLearner(cl = "classif.nnet", fix.factors.prediction = TRUE)
# Search space: hidden-layer size, max optimizer iterations, and weight decay.
param_set <- makeParamSet(makeIntegerParam("size", lower = 1, upper = 5),
makeIntegerParam("maxit", lower = 1, upper = 300),
makeNumericParam("decay", lower = 0.0001, upper = 1)
)
# Random search with 10 sampled configurations.
ctrl_random <- makeTuneControlRandom(maxit = 10)
# Create holdout sampling
# predict = "both" makes mlr predict on the training split as well, so
# train-aggregated measures (train.sd below) can be computed.
holdout <- makeResampleDesc("Holdout", predict = "both")
# Perform tuning
# Measures: mmce and acc on the test split, plus their train-split standard
# deviations via setAggregation (reported as NA here since Holdout has a
# single train split).
lrn_tune <- tuneParams(learner = lrn, task = task, resampling = holdout, control = ctrl_random,
par.set = param_set,
measures = list(mmce, setAggregation(mmce, train.sd),
acc, setAggregation(acc, train.sd)
)
)
## [Tune] Started tuning learner classif.nnet for parameter set:
## Type len Def Constr Req Tunable Trafo
## size integer - - 1 to 5 - TRUE -
## maxit integer - - 1 to 300 - TRUE -
## decay numeric - - 0.0001 to 1 - TRUE -
## With control class: TuneControlRandom
## Imputation value: 1Imputation value: InfImputation value: -0Imputation value: Inf
## [Tune-x] 1: size=3; maxit=291; decay=0.337
## # weights: 30
## initial value 90.017712
## iter 10 value 69.909105
## iter 20 value 68.519082
## iter 30 value 68.489080
## final value 68.488869
## converged
## [Tune-y] 1: mmce.test.mean=0.0750000,mmce.train.sd= NA,acc.test.mean=0.9250000,acc.train.sd= NA; time: 0.0 min
## [Tune-x] 2: size=4; maxit=297; decay=0.926
## # weights: 39
## initial value 102.874652
## iter 10 value 86.129571
## iter 20 value 85.812356
## iter 30 value 85.587055
## iter 40 value 85.492677
## final value 85.492517
## converged
## [Tune-y] 2: mmce.test.mean=0.3000000,mmce.train.sd= NA,acc.test.mean=0.7000000,acc.train.sd= NA; time: 0.0 min
## [Tune-x] 3: size=3; maxit=107; decay=0.364
## # weights: 30
## initial value 90.002254
## iter 10 value 73.795890
## iter 20 value 70.600815
## iter 30 value 70.471580
## iter 40 value 70.457018
## final value 70.456414
## converged
## [Tune-y] 3: mmce.test.mean=0.0750000,mmce.train.sd= NA,acc.test.mean=0.9250000,acc.train.sd= NA; time: 0.0 min
## [Tune-x] 4: size=1; maxit=98; decay=0.888
## # weights: 12
## initial value 97.025873
## iter 10 value 86.577790
## final value 86.494148
## converged
## [Tune-y] 4: mmce.test.mean=0.3000000,mmce.train.sd= NA,acc.test.mean=0.7000000,acc.train.sd= NA; time: 0.0 min
## [Tune-x] 5: size=3; maxit=214; decay=0.438
## # weights: 30
## initial value 110.918770
## iter 10 value 81.202165
## iter 20 value 75.915193
## iter 30 value 74.361787
## iter 40 value 74.182732
## iter 50 value 74.168126
## final value 74.168124
## converged
## [Tune-y] 5: mmce.test.mean=0.1250000,mmce.train.sd= NA,acc.test.mean=0.8750000,acc.train.sd= NA; time: 0.0 min
## [Tune-x] 6: size=2; maxit=152; decay=0.0457
## # weights: 21
## initial value 89.898037
## iter 10 value 38.842565
## iter 20 value 29.586591
## iter 30 value 29.071818
## iter 40 value 28.994512
## iter 50 value 28.991637
## final value 28.991635
## converged
## [Tune-y] 6: mmce.test.mean=0.0500000,mmce.train.sd= NA,acc.test.mean=0.9500000,acc.train.sd= NA; time: 0.0 min
## [Tune-x] 7: size=2; maxit=192; decay=0.339
## # weights: 21
## initial value 93.481108
## iter 10 value 73.030253
## iter 20 value 69.268595
## final value 69.235349
## converged
## [Tune-y] 7: mmce.test.mean=0.0750000,mmce.train.sd= NA,acc.test.mean=0.9250000,acc.train.sd= NA; time: 0.0 min
## [Tune-x] 8: size=1; maxit=126; decay=0.082
## # weights: 12
## initial value 98.881194
## iter 10 value 71.943449
## iter 20 value 45.171255
## iter 30 value 45.096613
## final value 45.096453
## converged
## [Tune-y] 8: mmce.test.mean=0.0500000,mmce.train.sd= NA,acc.test.mean=0.9500000,acc.train.sd= NA; time: 0.0 min
## [Tune-x] 9: size=5; maxit=133; decay=0.804
## # weights: 48
## initial value 111.225861
## iter 10 value 84.368740
## iter 20 value 83.610365
## iter 30 value 83.570240
## final value 83.570131
## converged
## [Tune-y] 9: mmce.test.mean=0.3000000,mmce.train.sd= NA,acc.test.mean=0.7000000,acc.train.sd= NA; time: 0.0 min
## [Tune-x] 10: size=5; maxit=222; decay=0.87
## # weights: 48
## initial value 126.396017
## iter 10 value 85.692185
## iter 20 value 84.765681
## iter 30 value 84.749695
## final value 84.749682
## converged
## [Tune-y] 10: mmce.test.mean=0.3000000,mmce.train.sd= NA,acc.test.mean=0.7000000,acc.train.sd= NA; time: 0.0 min
## [Tune] Result: size=2; maxit=152; decay=0.0457 : mmce.test.mean=0.0500000,mmce.train.sd= NA,acc.test.mean=0.9500000,acc.train.sd= NA
# Train a final model with hand-fixed hyperparameters on the full task.
task <- makeClassifTask(data = knowledge_train_data, target = "UNS")
## Warning in makeTask(type = type, data = data, weights = weights, blocking
## = blocking, : Provided data is not a pure data.frame but from class tbl_df,
## hence it will be converted.
lrn <- makeLearner(cl = "classif.nnet", fix.factors.prediction = TRUE)
# Set hyperparameters
# NOTE(review): these values (size=1, maxit=150, decay=0) differ from the
# tuned optimum reported just above (size=2, maxit=152, decay=0.0457) —
# presumably a course exercise choice; confirm intent.
lrn_best <- setHyperPars(lrn, par.vals = list(size=1, maxit = 150, decay = 0))
# Train model
model_best <- train(lrn_best, task)
## # weights: 12
## initial value 132.424063
## iter 10 value 73.390690
## iter 20 value 23.114640
## iter 30 value 13.856769
## iter 40 value 13.691907
## iter 50 value 13.246583
## iter 60 value 13.245951
## iter 70 value 13.236257
## iter 80 value 13.106557
## iter 90 value 13.105081
## iter 100 value 13.104867
## iter 110 value 13.099957
## iter 120 value 13.099780
## iter 130 value 13.095653
## iter 140 value 13.094127
## iter 150 value 13.094025
## final value 13.094025
## stopped after 150 iterations
Chapter 4 - Hyperparameter Tuning with h2o
Machine Learning with h2o:
Grid and random search with h2o:
Automatic machine learning with h2o:
Wrap up:
Example code includes:
# NOTE: this chunk runs in the interactive console but fails under knitr —
# presumably the DLL-limit issue described in the introduction; verify.
library(mlr)
vecSeed <- c(15.26, 14.29, 13.84, 16.14, 14.38, 14.69, 15.26, 13.89, 13.78, 13.74, 14.59, 13.99, 15.69, 14.7, 12.72, 14.11, 15.01, 13.02, 14.11, 13.45, 13.16, 15.49, 14.09, 13.94, 15.05, 17.08, 14.8, 13.5, 13.16, 15.5, 13.8, 15.36, 14.99, 14.43, 15.78, 17.63, 16.84, 19.11, 16.82, 16.77, 20.71, 17.12, 18.72, 20.2, 19.57, 19.51, 18.88, 18.98, 20.88, 18.81, 18.59, 18.36, 16.87, 18.17, 18.72, 19.46, 19.18, 18.95, 18.83, 17.63, 18.55, 18.45, 19.38, 19.13, 19.14, 20.97, 19.06, 18.96, 19.15, 20.24, 13.07, 13.34, 12.22, 11.82, 11.21, 11.43, 12.49, 10.79, 11.83, 12.01, 12.26, 11.18, 11.19, 11.34, 11.75, 11.49, 12.54, 12.02, 12.05, 12.55, 11.14, 12.1, 12.15, 10.8, 11.26, 11.41, 12.46, 12.19, 11.65, 11.56, 11.81, 10.91, 11.23, 11.27, 11.87, 14.84, 14.09, 13.94, 14.99, 14.21, 14.49, 14.85, 14.02, 14.06, 14.05, 14.28, 13.83, 14.75, 14.21, 13.57, 14.26, 14.76, 13.76, 14.18, 14.02, 13.82, 14.94, 14.41, 14.17, 14.68, 15.38, 14.52, 13.85, 13.55, 14.86, 14.04, 14.76, 14.56, 14.4, 14.91, 15.98, 15.67, 16.26, 15.51, 15.62, 17.23, 15.55, 16.19, 16.89, 16.74, 16.71, 16.26, 16.66, 17.05, 16.29, 16.05, 16.52, 15.65, 16.26, 16.34, 16.5, 16.63, 16.42, 16.29, 15.86, 16.22, 16.12, 16.72, 16.31, 16.61, 17.25, 16.45, 16.2, 16.45, 16.91, 13.92, 13.95, 13.32, 13.4, 13.13, 13.13, 13.46, 12.93, 13.23, 13.52, 13.6, 13.04, 13.05, 12.87, 13.52, 13.22, 13.67, 13.33, 13.41, 13.57, 12.79, 13.15, 13.45, 12.57, 13.01, 12.95, 13.41, 13.36, 13.07, 13.31, 13.45, 12.8, 12.82, 12.86, 13.02, 0.87, 0.9, 0.9, 0.9, 0.9, 0.88, 0.87, 0.89, 0.88, 0.87, 0.9, 0.92, 0.91, 0.92, 0.87, 0.87, 0.87, 0.86, 0.88, 0.86, 0.87, 0.87, 0.85, 0.87, 0.88, 0.91, 0.88, 0.89, 0.9, 0.88, 0.88, 0.89, 0.89, 0.88, 0.89, 0.87, 0.86, 0.91, 0.88, 0.86, 0.88, 0.89, 0.9, 0.89, 0.88, 0.88, 0.9, 0.86, 0.9, 0.89, 0.91, 0.85, 0.86, 0.86, 0.88, 0.9, 0.87, 0.88, 0.89, 0.88, 0.89, 0.89, 0.87, 0.9, 0.87, 0.89, 0.89, 0.91, 0.89, 0.89, 0.85, 0.86, 0.87, 0.83, 0.82, 0.83, 0.87, 0.81, 0.85, 0.82, 0.83, 0.83, 0.83, 0.86, 0.81, 0.83, 0.84, 0.85, 0.84, 0.86, 
0.86, 0.88, 0.84, 0.86, 0.84, 0.86, 0.87, 0.86, 0.86, 0.82, 0.82, 0.84, 0.86, 0.86, 0.88, 5.76, 5.29, 5.32, 5.66, 5.39, 5.56, 5.71, 5.44, 5.48, 5.48, 5.35, 5.12, 5.53, 5.21, 5.23, 5.52, 5.79, 5.39, 5.54, 5.52, 5.45, 5.76, 5.72, 5.58, 5.71, 5.83, 5.66, 5.35, 5.14, 5.88, 5.38, 5.7, 5.57, 5.58, 5.67, 6.19, 6, 6.15, 6.02, 5.93, 6.58, 5.85, 6.01, 6.29, 6.38, 6.37, 6.08, 6.55, 6.45, 6.27, 6.04, 6.67, 6.14, 6.27, 6.22, 6.11, 6.37, 6.25, 6.04, 6.03, 6.15, 6.11, 6.3, 6.18, 6.26, 6.56, 6.42, 6.05, 6.25, 6.32, 5.47, 5.39, 5.22, 5.31, 5.28, 5.18, 5.27, 5.32, 5.26, 5.41, 5.41, 5.22, 5.25, 5.05, 5.44, 5.3, 5.45, 5.35, 5.27, 5.33, 5.01, 5.11, 5.42, 4.98, 5.19, 5.09, 5.24, 5.24, 5.11, 5.36, 5.41, 5.09, 5.09, 5.09, 5.13, 3.31, 3.34, 3.38, 3.56, 3.31, 3.26, 3.24, 3.2, 3.16, 3.11, 3.33, 3.38, 3.51, 3.47, 3.05, 3.17, 3.25, 3.03, 3.22, 3.06, 2.98, 3.37, 3.19, 3.15, 3.33)
vecSeed <- c(vecSeed, 3.68, 3.29, 3.16, 3.2, 3.4, 3.15, 3.39, 3.38, 3.27, 3.43, 3.56, 3.48, 3.93, 3.49, 3.44, 3.81, 3.57, 3.86, 3.86, 3.77, 3.8, 3.76, 3.67, 4.03, 3.69, 3.86, 3.48, 3.46, 3.51, 3.68, 3.89, 3.68, 3.75, 3.79, 3.57, 3.67, 3.77, 3.79, 3.9, 3.74, 3.99, 3.72, 3.9, 3.82, 3.96, 2.99, 3.07, 2.97, 2.78, 2.69, 2.72, 2.97, 2.65, 2.84, 2.78, 2.83, 2.69, 2.67, 2.85, 2.68, 2.69, 2.88, 2.81, 2.85, 2.97, 2.79, 2.94, 2.84, 2.82, 2.71, 2.77, 3.02, 2.91, 2.85, 2.68, 2.72, 2.67, 2.82, 2.8, 2.95, 2.22, 2.7, 2.26, 1.36, 2.46, 3.59, 4.54, 3.99, 3.14, 2.93, 4.18, 5.23, 1.6, 1.77, 4.1, 2.69, 1.79, 3.37, 2.75, 3.53, 0.86, 3.41, 3.92, 2.12, 2.13, 2.96, 3.11, 2.25, 2.46, 4.71, 1.56, 1.37, 2.96, 3.98, 5.59, 4.08, 4.67, 2.94, 4, 4.92, 4.45, 2.86, 5.32, 5.17, 1.47, 2.96, 1.65, 3.69, 5.02, 3.24, 6, 4.93, 3.7, 2.85, 2.19, 4.31, 3.36, 3.37, 2.55, 3.75, 1.74, 2.23, 3.68, 2.11, 6.68, 4.68, 2.25, 4.33, 3.08, 5.9, 5.3, 6, 5.47, 4.47, 6.17, 2.22, 4.42, 5.46, 5.2, 6.99, 4.76, 3.33, 5.81, 3.35, 4.38, 5.39, 3.08, 4.27, 4.99, 4.42, 6.39, 2.2, 3.64, 4.77, 5.34, 4.96, 4.99, 4.86, 5.21, 4.06, 4.9, 4.18, 7.52, 3.98, 3.6, 5.22, 4.83, 4.8, 5.17, 4.96, 5.22, 5.31, 4.74, 4.87, 4.83, 4.78, 4.78, 5.05, 4.65, 4.91, 5.22, 5, 4.83, 5.04, 5.1, 5.06, 5.23, 5.3, 5.01, 5.36, 5.48, 5.31, 5.18, 4.78, 5.53, 4.96, 5.13, 5.17, 5.14, 5.14, 6.06, 5.88, 6.08, 5.84, 5.8, 6.45, 5.75, 5.88, 6.19, 6.27, 6.18, 6.11, 6.5, 6.32, 6.05, 5.88, 6.45, 5.97, 6.27, 6.1, 6.01, 6.23, 6.15, 5.88, 5.93, 5.89, 5.79, 5.96, 5.92, 6.05, 6.32, 6.16, 5.75, 6.18, 6.19, 5.39, 5.31, 5.22, 5.18, 5.28, 5.13, 5, 5.19, 5.31, 5.27, 5.36, 5, 5.22, 5, 5.31, 5.31, 5.49, 5.31, 5.05, 5.18, 5.05, 5.06, 5.34, 5.06, 5.09, 4.83, 5.15, 5.16, 5.13, 5.18, 5.35, 4.96, 4.96, 5, 5.13, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3)
# Assemble the seeds training set: vecSeed holds 8 columns x 105 rows of data
# stored column-by-column, so byrow=FALSE reshapes it into the correct matrix.
seeds_train_data <- as.data.frame(matrix(vecSeed, nrow=105, byrow=FALSE))
names(seeds_train_data) <- c('area', 'perimeter', 'compactness', 'kernel_length', 'kernel_width', 'asymmetry', 'kernel_groove', 'seed_type')
# seed_type (1/2/3) is a class label, not a measurement: store it as a factor
seeds_train_data$seed_type <- as.factor(seeds_train_data$seed_type)
glimpse(seeds_train_data)
# Initialise h2o cluster
h2o::h2o.init()
# Convert data to h2o frame
seeds_train_data_hf <- h2o::as.h2o(seeds_train_data)
# Identify target and features
y <- "seed_type"
x <- setdiff(colnames(seeds_train_data_hf), y)
# Split data into train & validation sets
# (no ratios supplied, so h2o's default split proportion is used)
sframe <- h2o::h2o.splitFrame(seeds_train_data_hf, seed = 42)
train <- sframe[[1]]
valid <- sframe[[2]]
# Calculate ratio of the target variable in the training set
summary(seeds_train_data_hf$seed_type, exact_quantiles = TRUE)
# Train random forest model
rf_model <- h2o::h2o.randomForest(x = x, y = y, training_frame = train, validation_frame = valid)
# Calculate model performance (valid = TRUE scores on the validation frame)
perf <- h2o::h2o.performance(rf_model, valid = TRUE)
# Extract confusion matrix
h2o::h2o.confusionMatrix(perf)
# Extract logloss
h2o::h2o.logloss(perf)
# Define hyperparameters for the deep-learning grid search
dl_params <- list(hidden = list(c(50, 50), c(100, 100)), epochs = c(5, 10, 15),
rate = c(0.001, 0.005, 0.01)
)
# Define search criteria
search_criteria <- list(strategy = "RandomDiscrete",
max_runtime_secs = 10, # this is way too short & only used to keep runtime short!
seed = 42
)
# Train with random search
dl_grid <- h2o::h2o.grid("deeplearning", grid_id = "dl_grid", x = x, y = y,
training_frame = train, validation_frame = valid, seed = 42,
hyper_params = dl_params, search_criteria = search_criteria
)
# Define early stopping
# NOTE(review): stopping_params is defined here but never passed to any
# subsequent call in this transcript -- confirm whether it was meant to feed
# h2o.grid()'s search criteria.
stopping_params <- list(strategy = "RandomDiscrete", stopping_metric = "misclassification",
stopping_rounds = 2, stopping_tolerance = 0.1, seed = 42
)
# Run automatic machine learning
automl_model <- h2o::h2o.automl(x = x, y = y, training_frame = train, max_runtime_secs = 10,
sort_metric = "mean_per_class_error", leaderboard_frame = valid, seed = 42
)
# Extract the leaderboard
lb <- automl_model@leaderboard
head(lb)
# Assign best model new object name
aml_leader <- automl_model@leader
# Look at best model
summary(aml_leader)
Chapter 1 - Programming with purrr
Refresher of purrr Basics:
Introduction to mappers:
Using Mappers to Clean Data:
Predicates:
Example code includes:
# Convert an hourly visit count to a daily total (24 hours per day).
to_day <- function(x) x * 24
# Hourly visit counts for three one-week samples (Mon-Sun)
visit_a <- c(117, 147, 131, 73, 81, 134, 121)
visit_b <- c(180, 193, 116, 166, 131, 153, 146)
visit_c <- c(57, 110, 68, 72, 87, 141, 67)
# Create a list containing both vectors: all_visits
all_visits <- list(visit_a, visit_b)
# Convert to daily number of visits: all_visits_day
all_visits_day <- map(all_visits, to_day)
# Map the mean() function and output a numeric vector
map_dbl(all_visits_day, mean)
## [1] 2756.571 3720.000
# You'll test out both map() and walk() for plotting
# Both return the "side effects," that is to say, the changes in the environment (drawing plots, downloading a file, changing the working directory...), but walk() won't print anything to the console.
# Create all_tests list and modify with to_day() function
all_tests <- list(visit_a, visit_b, visit_c)
all_tests_day <- map(all_tests, to_day)
# Plot all_tests_day with map
map(all_tests_day, barplot)
## [[1]]
## [,1]
## [1,] 0.7
## [2,] 1.9
## [3,] 3.1
## [4,] 4.3
## [5,] 5.5
## [6,] 6.7
## [7,] 7.9
##
## [[2]]
## [,1]
## [1,] 0.7
## [2,] 1.9
## [3,] 3.1
## [4,] 4.3
## [5,] 5.5
## [6,] 6.7
## [7,] 7.9
##
## [[3]]
## [,1]
## [1,] 0.7
## [2,] 1.9
## [3,] 3.1
## [4,] 4.3
## [5,] 5.5
## [6,] 6.7
## [7,] 7.9
# Plot all_tests_day
walk(all_tests_day, barplot)
# Get sum of all visits and class of sum_all
sum_all <- pmap(all_tests_day, sum)
class(sum_all)
## [1] "list"
# Turn visit_a into daily number using an anonymous function
map(visit_a, function(x) {
x*24
})
## [[1]]
## [1] 2808
##
## [[2]]
## [1] 3528
##
## [[3]]
## [1] 3144
##
## [[4]]
## [1] 1752
##
## [[5]]
## [1] 1944
##
## [[6]]
## [1] 3216
##
## [[7]]
## [1] 2904
# Turn visit_a into daily number of visits by using a mapper
map(visit_a, ~.x*24)
## [[1]]
## [1] 2808
##
## [[2]]
## [1] 3528
##
## [[3]]
## [1] 3144
##
## [[4]]
## [1] 1752
##
## [[5]]
## [1] 1944
##
## [[6]]
## [1] 3216
##
## [[7]]
## [1] 2904
# Create a mapper object called to_day
to_day <- as_mapper(~.x*24)
# Use it on the three vectors
map(visit_a, to_day)
## [[1]]
## [1] 2808
##
## [[2]]
## [1] 3528
##
## [[3]]
## [1] 3144
##
## [[4]]
## [1] 1752
##
## [[5]]
## [1] 1944
##
## [[6]]
## [1] 3216
##
## [[7]]
## [1] 2904
map(visit_b, to_day)
## [[1]]
## [1] 4320
##
## [[2]]
## [1] 4632
##
## [[3]]
## [1] 2784
##
## [[4]]
## [1] 3984
##
## [[5]]
## [1] 3144
##
## [[6]]
## [1] 3672
##
## [[7]]
## [1] 3504
map(visit_c, to_day)
## [[1]]
## [1] 1368
##
## [[2]]
## [1] 2640
##
## [[3]]
## [1] 1632
##
## [[4]]
## [1] 1728
##
## [[5]]
## [1] 2088
##
## [[6]]
## [1] 3384
##
## [[7]]
## [1] 1608
# Round visit_a to the nearest tenth with a mapper
map_dbl(visit_a, ~ round(.x, -1))
## [1] 120 150 130 70 80 130 120
# Create to_ten, a mapper that rounds to the nearest tenth
to_ten <- as_mapper(~ round(.x, -1))
# Map to_ten on visit_b
map_dbl(visit_b, to_ten)
## [1] 180 190 120 170 130 150 150
# Map to_ten on visit_c
map_dbl(visit_c, to_ten)
## [1] 60 110 70 70 90 140 70
# Create a mapper that tests whether .x is more than 100
is_more_than_hundred <- as_mapper(~ .x > 100)
# Run this mapper on the all_visits object
all_visits <- list(visit_a, visit_b, visit_c)
# keep() retains only the elements of each vector for which the mapper is TRUE
map(all_visits, ~ keep(.x, is_more_than_hundred) )
## [[1]]
## [1] 117 147 131 134 121
##
## [[2]]
## [1] 180 193 116 166 131 153 146
##
## [[3]]
## [1] 110 141
# Use the day vector to set names to all_list
day <- c("mon", "tue", "wed", "thu", "fri", "sat", "sun")
full_visits_named <- map(all_visits, ~ set_names(.x, day))
# Use this mapper with keep()
map(full_visits_named, ~ keep(.x, is_more_than_hundred))
## [[1]]
## mon tue wed sat sun
## 117 147 131 134 121
##
## [[2]]
## mon tue wed thu fri sat sun
## 180 193 116 166 131 153 146
##
## [[3]]
## tue sat
## 110 141
# Set the name of each subvector
day <- c("mon", "tue", "wed", "thu", "fri", "sat", "sun")
all_visits_named <- map(all_visits, ~ set_names(.x, day))
# Create a mapper that will test if .x is over 100
threshold <- as_mapper(~.x > 100)
# Run this mapper on the all_visits object: group_over
group_over <- map(all_visits, ~ keep(.x, threshold) )
# Run this mapper on the all_visits object: group_under
group_under <- map(all_visits, ~ discard(.x, threshold) )
# Create a threshold variable, set it to 160
threshold <- 160
# Create a mapper that tests if .x is over the defined threshold
over_threshold <- as_mapper(~ .x > threshold)
# Are all elements in every all_visits vectors over the defined threshold?
map(all_visits, ~ every(.x, over_threshold))
## [[1]]
## [1] FALSE
##
## [[2]]
## [1] FALSE
##
## [[3]]
## [1] FALSE
# Are some elements in every all_visits vectors over the defined threshold?
map(all_visits, ~ some(.x, over_threshold))
## [[1]]
## [1] FALSE
##
## [[2]]
## [1] TRUE
##
## [[3]]
## [1] FALSE
Chapter 2 - Functional Programming from Theory to Practice
Functional Programming in R:
Tools for Functional Programming in purrr:
function(...){fun(..., na.rm = TRUE)} Using possibly():
Handling adverb results:
Example code includes:
# `$` is a function call, of a special type called 'infix operator', as they are put between two elements, and can be used without parenthesis.
# Launch Sys.time(), Sys.sleep(1), & Sys.time()
Sys.time()
## [1] "2019-06-10 08:40:53 CDT"
Sys.sleep(1)
Sys.time()
## [1] "2019-06-10 08:40:54 CDT"
# Launch nrow(iris), Sys.sleep(1), & nrow(iris)
data(iris)
nrow(iris)
## [1] 150
Sys.sleep(1)
nrow(iris)
## [1] 150
# Launch ls(), create an object, then rerun the ls() function
# ls()
# this <- 12
# ls()
# Create a plot of the iris dataset
plot(iris)
# Mix of reachable and deliberately broken URLs for demonstrating safely()
urls <- c('https://thinkr.fr', 'https://colinfay.me', 'http://not_working.org', 'https://datacamp.com', 'http://cran.r-project.org/', 'https://not_working_either.org')
# Create a safe version of read_lines(): returns list(result=, error=)
# instead of throwing, so one bad URL cannot abort the whole map()
safe_read <- safely(read_lines)
# Map it on the urls vector
res <- map(urls, safe_read)
# Set the name of the results to `urls`
named_res <- set_names(res, urls)
# Extract only the "error" part of each sublist (NULL for successful reads)
map(named_res, "error")
## $`https://thinkr.fr`
## NULL
##
## $`https://colinfay.me`
## NULL
##
## $`http://not_working.org`
## <simpleError in open.connection(con, "rb"): Could not resolve host: not_working.org>
##
## $`https://datacamp.com`
## NULL
##
## $`http://cran.r-project.org/`
## NULL
##
## $`https://not_working_either.org`
## <simpleError in open.connection(con, "rb"): Could not resolve host: not_working_either.org>
# Code a function that discard() the NULL from safe_read()
# safely() always returns list(result =, error =) with exactly one of the
# two set to NULL, so discarding NULLs keeps whichever part actually fired.
safe_read_discard <- function(url){
safe_read(url) %>%
discard(is.null)
}
# Map this function on the url list
res <- map(urls, safe_read_discard)
# Create a possibly() version of read_lines()
possible_read <- possibly(read_lines, otherwise = 404)
# Map this function on urls, pipe it into set_names()
res <- map(urls, possible_read) %>% set_names(urls)
# Paste each element of the list
res_pasted <- map(res, paste, collapse=" ")
# Keep only the elements which are equal to 404
keep(res_pasted, ~ .x == 404)
## $`http://not_working.org`
## [1] "404"
##
## $`https://not_working_either.org`
## [1] "404"
# url_tester: return the URLs in `url_list` that could be read successfully.
# possibly() replaces read_lines() failures with the sentinel 404; failed
# URLs are then discarded and only the names of the good ones are returned.
url_tester <- function(url_list){
  url_list %>%
    # Map a version of read_lines() that otherwise returns 404
    map( possibly(read_lines, otherwise = 404) ) %>%
    # BUG FIX: name the results after the url_list argument -- the original
    # referenced the global `urls`, which broke the function for any other input
    set_names( url_list ) %>%
    # paste() and collapse each element
    map(paste, collapse =" ") %>%
    # Remove the 404
    discard(~.x==404) %>%
    names() # Will return the names of the good ones
}
# Try this function on the urls object
url_tester(urls)
## [1] "https://thinkr.fr" "https://colinfay.me"
## [3] "https://datacamp.com" "http://cran.r-project.org/"
# url_tester: read every URL in `url_list` with a safely()-wrapped
# read_lines() and return either all $result or all $error components.
url_tester <- function(url_list, type = c("result", "error")){
  # BUG FIX: match.arg() collapses the length-2 default to one validated
  # choice. The original compared `type == "result"` against the full
  # default vector, giving a length-2 condition (an error in R >= 4.2).
  type <- match.arg(type)
  res <- url_list %>%
    # Create a safely() version of read_lines()
    map( safely(read_lines) ) %>%
    set_names( url_list ) %>%
    # Transpose into a list of $result and $error
    purrr::transpose()
  # Return the requested component
  if (type == "result") return( res$result )
  if (type == "error") return( res$error )
}
# Try this function on the urls object
url_tester(urls, type = "error")
## $`https://thinkr.fr`
## NULL
##
## $`https://colinfay.me`
## NULL
##
## $`http://not_working.org`
## <simpleError in open.connection(con, "rb"): Could not resolve host: not_working.org>
##
## $`https://datacamp.com`
## NULL
##
## $`http://cran.r-project.org/`
## NULL
##
## $`https://not_working_either.org`
## <simpleError in open.connection(con, "rb"): Could not resolve host: not_working_either.org>
# url_tester: GET each URL in `url_list` and return the HTTP status code
# of every URL that responded (unreachable URLs are dropped via compact()).
url_tester <- function(url_list){
  url_list %>%
    # Map a version of GET() that would otherwise return NULL
    map( possibly(httr::GET, otherwise=NULL) ) %>%
    # BUG FIX: name the results after the url_list argument -- the original
    # referenced the global `urls`, breaking the function for other inputs
    set_names( url_list ) %>%
    # Remove the NULL
    compact() %>%
    # Extract all the "status_code" elements
    map("status_code")
}
# Try this function on the urls object
url_tester(urls)
## $`https://thinkr.fr`
## [1] 200
##
## $`https://colinfay.me`
## [1] 200
##
## $`https://datacamp.com`
## [1] 200
##
## $`http://cran.r-project.org/`
## [1] 200
Chapter 3 - Better Code with purrr
Rationale for cleaner code:
Building functions with compose() and negate():
Prefilling functions:
List columns:
Example code includes:
urls <- c('https://thinkr.fr', 'https://colinfay.me', 'https://datacamp.com', 'http://cran.r-project.org/')
# Compose a status extractor
status_extract <- purrr::compose(httr::status_code, httr::GET)
# Try with "https://thinkr.fr" & "http://datacamp.com"
status_extract("https://thinkr.fr")
## [1] 200
status_extract("http://datacamp.com")
## [1] 200
# Map it on the urls vector, return a vector of numbers
map_dbl(urls, status_extract)
## [1] 200 200 200 200
# Negate the %in% function to get a "not in set" infix operator
`%not_in%` <- negate(`%in%`)
# strict_code: return the HTTP status code only if it is a success code
# (200-203); any other code is mapped to NA
strict_code <- function(url){
code <- status_extract(url)
if (code %not_in% c(200:203)){
return(NA)
} else {
return(code)
}
}
# Map the strict_code function on the urls vector
res <- map(urls, strict_code)
# Set the names of the results using the urls vector
res_named <- set_names(res, urls)
# Negate the is.na function
is_not_na <- negate(is.na)
# Run is_not_na on the results (element-wise over the named list)
is_not_na(res_named)
## https://thinkr.fr https://colinfay.me
## TRUE TRUE
## https://datacamp.com http://cran.r-project.org/
## TRUE TRUE
# Prefill html_nodes() with the css param set to h2
get_h2 <- partial(rvest::html_nodes, css="h2")
# Combine the html_text, get_h2 and read_html functions
get_content <- purrr::compose(rvest::html_text, get_h2, xml2::read_html)
# Map get_content to the urls list
res <- map(urls, get_content) %>%
set_names(urls)
# Print the results to the console
res
## $`https://thinkr.fr`
## [1] "Conseil, développement et formation au logiciel R"
## [2] "Formez-vous au logiciel R !"
## [3] "\r\n\t\tRstudio & ThinkR roadshow – le 6 juin à Paris\r\n\t"
## [4] "\r\n\t\tBilan pédagogique et financier 2018\r\n\t"
## [5] "\r\n\t\tPédagogie de la formation au langage R\r\n\t"
## [6] "\r\n\t\tRetour sur les projets R des étudiants du MSc X-HEC Data Science for Business\r\n\t"
## [7] "\r\n\t\tConstruisons la certification R du RConsortium\r\n\t"
## [8] "\r\n\t\tLes tests statistiques\r\n\t"
## [9] "\r\n\t\tÀ la découverte de RStudio Package Manager\r\n\t"
## [10] "\r\n\t\tLes pièges de la représentation de données\r\n\t"
## [11] "\r\n\t\tDBI : Distributeur des Brasseurs Indépendants ? Non DataBase Interface\r\n\t"
##
## $`https://colinfay.me`
## character(0)
##
## $`https://datacamp.com`
## character(0)
##
## $`http://cran.r-project.org/`
## character(0)
# Create a partial version of html_nodes(), with the css param set to "a"
a_node <- partial(rvest::html_nodes, css="a")
# Create href(), a partial version of html_attr()
href <- partial(rvest::html_attr, name = "href")
# Combine href(), a_node(), and read_html()
get_links <- purrr::compose(href, a_node, xml2::read_html)
# Map get_links() to the urls list
res <- map(urls, get_links) %>%
set_names(urls)
# Create a "links" columns, by mapping get_links() on urls
df2 <- tibble::tibble(urls=urls) %>%
mutate(links = map(urls, get_links))
# Print df2 to see what it looks like
df2
## # A tibble: 4 x 2
## urls links
## <chr> <list>
## 1 https://thinkr.fr <chr [145]>
## 2 https://colinfay.me <chr [47]>
## 3 https://datacamp.com <chr [50]>
## 4 http://cran.r-project.org/ <chr [1]>
# unnest() df2 to have a tidy dataframe
df2 %>%
unnest()
## # A tibble: 243 x 2
## urls links
## <chr> <chr>
## 1 https://thinkr~ https://thinkr.fr/
## 2 https://thinkr~ https://thinkr.fr/
## 3 https://thinkr~ https://thinkr.fr/formation-au-logiciel-r/
## 4 https://thinkr~ https://thinkr.fr/formation-au-logiciel-r/introduction-~
## 5 https://thinkr~ https://thinkr.fr/formation-au-logiciel-r/statistique-a~
## 6 https://thinkr~ https://thinkr.fr/formation-au-logiciel-r/programmation~
## 7 https://thinkr~ https://thinkr.fr/formation-au-logiciel-r/r-et-le-big-d~
## 8 https://thinkr~ https://thinkr.fr/formation-au-logiciel-r/r-pour-la-fin~
## 9 https://thinkr~ https://thinkr.fr/formation-au-logiciel-r/integration-d~
## 10 https://thinkr~ https://thinkr.fr/formation-au-logiciel-r/formation-sig~
## # ... with 233 more rows
Chapter 4 - Case Study
Discovering the Dataset:
Extracting Information from the Dataset:
Manipulating URL:
Identifying Influencers:
Wrap up:
Example code includes:
rstudioconfDF <- readRDS("./RInputFiles/#RStudioConf.RDS")
rstudioconfListA <- split(rstudioconfDF, seq(nrow(rstudioconfDF)))
rstudioconf <- lapply(rstudioconfListA, FUN=as.list)
# Print the first element of the list to the console
rstudioconf[[1]]
## $status_id
## [1] "960732355773239296"
##
## $created_at
## [1] "2018-02-06 04:30:17 UTC"
##
## $user_id
## [1] "626266741"
##
## $screen_name
## [1] "grod_rf"
##
## $text
## [1] "RT @dataandme: <U+0001F41C> Check it, @ajmcoqui's \"Debugging in RStudio\" \n<U+0001F4FD> Slides *and* cheat sheet!\nhttps://t.co/rAvKP9iXLa #rstats #rstudioconf htt…"
##
## $source
## [1] "Twitter for Android"
##
## $reply_to_status_id
## [1] NA
##
## $reply_to_user_id
## [1] NA
##
## $reply_to_screen_name
## [1] NA
##
## $is_quote
## [1] FALSE
##
## $is_retweet
## [1] TRUE
##
## $favorite_count
## [1] 0
##
## $retweet_count
## [1] 7
##
## $hashtags
## $hashtags[[1]]
## [1] "rstats" "rstudioconf"
##
##
## $symbols
## $symbols[[1]]
## [1] NA
##
##
## $urls_url
## $urls_url[[1]]
## [1] "buff.ly/2s7W8ED"
##
##
## $urls_t.co
## $urls_t.co[[1]]
## [1] "https://t.co/rAvKP9iXLa"
##
##
## $urls_expanded_url
## $urls_expanded_url[[1]]
## [1] "https://buff.ly/2s7W8ED"
##
##
## $media_url
## $media_url[[1]]
## [1] NA
##
##
## $media_t.co
## $media_t.co[[1]]
## [1] NA
##
##
## $media_expanded_url
## $media_expanded_url[[1]]
## [1] NA
##
##
## $media_type
## $media_type[[1]]
## [1] NA
##
##
## $ext_media_url
## $ext_media_url[[1]]
## [1] NA
##
##
## $ext_media_t.co
## $ext_media_t.co[[1]]
## [1] NA
##
##
## $ext_media_expanded_url
## $ext_media_expanded_url[[1]]
## [1] NA
##
##
## $ext_media_type
## [1] NA
##
## $mentions_user_id
## $mentions_user_id[[1]]
## [1] "3230388598" "732925397814247426"
##
##
## $mentions_screen_name
## $mentions_screen_name[[1]]
## [1] "dataandme" "ajmcoqui"
##
##
## $lang
## [1] "en"
##
## $quoted_status_id
## [1] NA
##
## $quoted_text
## [1] NA
##
## $retweet_status_id
## [1] "960600422556880896"
##
## $retweet_text
## [1] "<U+0001F41C> Check it, @ajmcoqui's \"Debugging in RStudio\" \n<U+0001F4FD> Slides *and* cheat sheet!\nhttps://t.co/rAvKP9iXLa #rstats #rstudioconf https://t.co/T4627GcuXK"
##
## $place_url
## [1] NA
##
## $place_name
## [1] NA
##
## $place_full_name
## [1] NA
##
## $place_type
## [1] NA
##
## $country
## [1] NA
##
## $country_code
## [1] NA
##
## $geo_coords
## $geo_coords[[1]]
## [1] NA NA
##
##
## $coords_coords
## $coords_coords[[1]]
## [1] NA NA
##
##
## $bbox_coords
## $bbox_coords[[1]]
## [1] NA NA NA NA NA NA NA NA
# Create a sublist of non-retweets
non_rt <- discard(rstudioconf, "is_retweet")
# Extract the favorite count element of each non_rt sublist
fav_count <- map_dbl(non_rt, "favorite_count")
# Get the median of favorite_count for non_rt
median(fav_count)
## [1] 1
# Keep the RT, extract the user_id, remove the duplicate
rt <- keep(rstudioconf, "is_retweet") %>%
map("user_id") %>%
unique()
# Remove the RT, extract the user id, remove the duplicate
non_rt <- discard(rstudioconf, "is_retweet") %>%
map("user_id") %>%
unique()
# Determine the total number of users
union(rt, non_rt) %>%
length()
## [1] 1742
# Determine the number of users who has just retweeted
setdiff(rt, non_rt) %>%
length()
## [1] 1302
# Prefill mean() with na.rm, and round() with digits = 1
mean_na_rm <- partial(mean, na.rm=TRUE)
round_one <- partial(round, digits=1)
# Compose a rounded_mean function
# (compose applies right-to-left: mean first, then round)
rounded_mean <- purrr::compose(round_one, mean_na_rm)
# Extract the non retweet
non_rt <- discard(rstudioconf, "is_retweet")
# Extract "favorite_count", and pass it to rounded_mean()
map_dbl(non_rt, "favorite_count") %>%
rounded_mean()
## [1] 3.3
# Combine as_vector(), compact(), and flatten()
# (applied right-to-left: flatten, then drop NULLs, then coerce to vector)
flatten_to_vector <- purrr::compose(as_vector, compact, flatten)
# Complete the function: pluck `what` from each element of the list and
# return the values as one flat vector
extractor <- function(list, what = "mentions_screen_name"){
map(list, what) %>%
flatten_to_vector()
}
# Create six_most, with tail(), sort(), and table()
six_most <- purrr::compose(tail, sort, table)
# Run extractor() on rstudioconf
extractor(rstudioconf) %>%
six_most()
## .
## JennyBryan hadleywickham AmeliaMN juliasilge drob
## 278 308 362 376 418
## rstudio
## 648
# Extract the "urls_url" elements, and flatten() the result
urls_clean <- map(rstudioconf, "urls_url") %>%
lapply(FUN=function(x) { ifelse(is.na(x), list(NULL), x) }) %>%
flatten()
# Remove the NULL
compact_urls <- compact(urls_clean)
# Create a mapper that detects the pattern "github"
# (note: .x[1] only tests the first element of each sublist)
has_github <- as_mapper(~ str_detect(.x[1], "github"))
# Look for the "github" pattern, and sum the result
map_lgl( compact_urls, has_github ) %>%
sum()
## [1] 346
# Proportion of elements in `vec` whose text matches `pattern`.
ratio_pattern <- function(vec, pattern){
  hits <- sum(str_detect(vec, pattern))
  hits / length(vec)
}
# Create flatten_and_compact()
flatten_and_compact <- purrr::compose(compact, flatten)
# Complete the pipe to get the ratio of URLs with "github"
map(rstudioconf, "urls_url") %>%
lapply(FUN=function(x) { ifelse(is.na(x), list(NULL), x) }) %>%
flatten_and_compact() %>%
ratio_pattern("github")
## Warning in stri_detect_regex(string, pattern, negate = negate, opts_regex =
## opts(pattern)): argument is not an atomic vector; coercing
## [1] 0.3033217
# Create mean_above, a mapper that tests if .x is over 3.3
mean_above <- as_mapper(~ . > 3.3)
# Prefill map_at() with "retweet_count", mean_above for above,
# and mean_above negation for below
above <- partial(map_at, .at = "retweet_count", .f = mean_above )
below <- partial(map_at, .at = "retweet_count", .f = negate(mean_above) )
# Map above() and below() on non_rt, keep the "retweet_count"
# ab <- map(non_rt, above) %>% keep("retweet_count")
# bl <- map(non_rt, below) %>% keep("retweet_count")
# map_dbl() extracts retweet_count straight into a numeric vector,
# replacing the redundant sapply(..., function(x) x) identity pass
rtCounts <- map_dbl(non_rt, "retweet_count")
ab <- rtCounts[rtCounts > 3.3]
bl <- rtCounts[rtCounts <= 3.3]
# Compare the size of both elements
length(ab)
## [1] 83
length(bl)
## [1] 1741
# Get the max() of "retweet_count"
max_rt <- map_dbl(non_rt, "retweet_count") %>%
max()
# Prefill map_at() with a mapper testing if .x equal max_rt
# max_rt_calc <- partial(map_at, .at = "retweet_count", .f = ~.x==max_rt )
idxMatch <- which(map(non_rt, "retweet_count") == max_rt)
# Map max_rt_calc on non_rt, keep the retweet_count & flatten
# res <- map(non_rt, max_rt_calc) %>%
# keep("retweet_count") %>%
# flatten()
# Print the "screen_name" and "text" of the result
res <- non_rt[[idxMatch]]
res$screen_name
## [1] "kearneymw"
res$text
## [1] "The week of #rstudioconf is a good time to remind everyone that some important books are [intentionally] available online for free:\nhttps://t.co/ePMiKs3MAr\nhttps://t.co/NHR7wmLGgd\nhttps://t.co/wbymwjG0CD\nhttps://t.co/uwqG0q967M\nhttps://t.co/AjXTfZgyAg\nhttps://t.co/zgoHq51PGV"
Chapter 1 - Introduction to Longitudinal Data
Introduction to Longitudinal Data:
Data Restructuring and Correlations:
Descriptive Statistics:
Example code includes:
# Load the calcium longitudinal dataset shipped with the lava package
data(calcium, package="lava")
str(calcium)
## 'data.frame': 501 obs. of 6 variables:
## $ bmd : num 0.815 0.875 0.911 0.952 0.97 0.813 0.833 0.855 0.881 0.901 ...
## $ group : Factor w/ 2 levels "C","P": 1 1 1 1 1 2 2 2 2 2 ...
## $ person: int 101 101 101 101 101 102 102 102 102 102 ...
## $ visit : int 1 2 3 4 5 1 2 3 4 5 ...
## $ age : num 10.9 11.4 11.9 12.4 12.9 ...
## $ ctime : int 11078 11266 11436 11625 11807 11078 11266 11427 11616 11791 ...
# Individuals with data at each visit number
count(calcium, visit)
## # A tibble: 5 x 2
## visit n
## <int> <int>
## 1 1 112
## 2 2 105
## 3 3 99
## 4 4 94
## 5 5 91
# Individuals in each group
count(calcium, person)
## # A tibble: 112 x 2
## person n
## <int> <int>
## 1 101 5
## 2 102 5
## 3 103 5
## 4 104 5
## 5 105 5
## 6 106 5
## 7 107 5
## 8 108 2
## 9 109 5
## 10 110 5
## # ... with 102 more rows
# Individuals in each group
count(calcium, group)
## # A tibble: 2 x 2
## group n
## <fct> <int>
## 1 C 245
## 2 P 256
# Individuals with each visit number in each group
count(calcium, visit, group)
## # A tibble: 10 x 3
## visit group n
## <int> <fct> <int>
## 1 1 C 55
## 2 1 P 57
## 3 2 C 52
## 4 2 P 53
## 5 3 C 48
## 6 3 P 51
## 7 4 C 46
## 8 4 P 48
## 9 5 C 44
## 10 5 P 47
# Restructure data into wide format for correlations
# NOTE(review): spread()/gather() are superseded by pivot_wider()/
# pivot_longer(); kept as-is here so the code matches the course transcript.
calcium_wide <- calcium %>%
mutate(visit_char = paste0('visit_', visit)) %>%
select(bmd, person, visit_char) %>%
spread(visit_char, bmd)
# Calculate correlations across time
# (correlate() handles the NAs from missed visits via pairwise completion,
# as reported in the output below)
calcium_corr <- calcium_wide %>%
select(-person) %>%
corrr::correlate(method="pearson") %>%
corrr::shave(upper=FALSE) %>%
corrr::fashion(decimals=3)
##
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
# Convert data from wide to long format
calcium_wide %>%
gather(key="visit", value="bmd", -person)
## person visit bmd
## 1 101 visit_1 0.815
## 2 102 visit_1 0.813
## 3 103 visit_1 0.812
## 4 104 visit_1 0.804
## 5 105 visit_1 0.904
## 6 106 visit_1 0.831
## 7 107 visit_1 0.777
## 8 108 visit_1 0.792
## 9 109 visit_1 0.821
## 10 110 visit_1 0.823
## 11 111 visit_1 0.828
## 12 112 visit_1 0.797
## 13 113 visit_1 0.867
## 14 114 visit_1 0.795
## 15 115 visit_1 0.835
## 16 116 visit_1 0.870
## 17 117 visit_1 0.856
## 18 118 visit_1 0.762
## 19 119 visit_1 0.758
## 20 120 visit_1 0.800
## 21 121 visit_1 0.795
## 22 122 visit_1 0.874
## 23 123 visit_1 0.830
## 24 124 visit_1 0.815
## 25 125 visit_1 0.800
## 26 126 visit_1 0.787
## 27 127 visit_1 0.795
## 28 128 visit_1 0.746
## 29 129 visit_1 0.837
## 30 130 visit_1 0.847
## 31 131 visit_1 0.832
## 32 132 visit_1 0.784
## 33 133 visit_1 0.883
## 34 134 visit_1 0.785
## 35 135 visit_1 0.822
## 36 136 visit_1 0.811
## 37 137 visit_1 0.815
## 38 201 visit_1 0.840
## 39 202 visit_1 0.866
## 40 203 visit_1 0.905
## 41 204 visit_1 0.883
## 42 205 visit_1 0.881
## 43 206 visit_1 0.915
## 44 207 visit_1 0.913
## 45 208 visit_1 0.868
## 46 209 visit_1 0.901
## 47 210 visit_1 0.879
## 48 211 visit_1 0.876
## 49 212 visit_1 0.989
## 50 213 visit_1 0.930
## 51 214 visit_1 0.896
## 52 215 visit_1 0.871
## 53 301 visit_1 0.902
## 54 302 visit_1 0.865
## 55 303 visit_1 0.910
## 56 305 visit_1 0.894
## 57 306 visit_1 0.897
## 58 307 visit_1 0.921
## 59 308 visit_1 0.840
## 60 309 visit_1 0.889
## 61 310 visit_1 0.819
## 62 311 visit_1 0.840
## 63 312 visit_1 0.835
## 64 313 visit_1 0.933
## 65 314 visit_1 0.894
## 66 315 visit_1 0.825
## 67 316 visit_1 0.837
## 68 317 visit_1 0.871
## 69 318 visit_1 0.840
## 70 319 visit_1 0.909
## 71 320 visit_1 0.923
## 72 321 visit_1 0.874
## 73 322 visit_1 0.841
## 74 323 visit_1 0.871
## 75 324 visit_1 0.827
## 76 325 visit_1 0.811
## 77 326 visit_1 0.856
## 78 327 visit_1 0.842
## 79 328 visit_1 0.860
## 80 329 visit_1 0.998
## 81 330 visit_1 0.876
## 82 331 visit_1 0.971
## 83 401 visit_1 1.028
## 84 402 visit_1 0.871
## 85 403 visit_1 0.981
## 86 404 visit_1 1.005
## 87 405 visit_1 1.012
## 88 406 visit_1 0.961
## 89 407 visit_1 0.948
## 90 408 visit_1 0.907
## 91 409 visit_1 0.936
## 92 410 visit_1 0.856
## 93 411 visit_1 0.970
## 94 412 visit_1 0.927
## 95 413 visit_1 0.921
## 96 414 visit_1 0.883
## 97 415 visit_1 0.955
## 98 416 visit_1 1.014
## 99 417 visit_1 0.938
## 100 418 visit_1 0.961
## 101 419 visit_1 0.879
## 102 420 visit_1 0.941
## 103 421 visit_1 0.945
## 104 422 visit_1 0.875
## 105 423 visit_1 0.861
## 106 424 visit_1 0.888
## 107 425 visit_1 0.928
## 108 426 visit_1 0.936
## 109 427 visit_1 0.859
## 110 428 visit_1 0.991
## 111 429 visit_1 0.971
## 112 430 visit_1 0.969
## 113 101 visit_2 0.875
## 114 102 visit_2 0.833
## 115 103 visit_2 0.812
## 116 104 visit_2 0.847
## 117 105 visit_2 0.927
## 118 106 visit_2 0.855
## 119 107 visit_2 0.803
## 120 108 visit_2 0.814
## 121 109 visit_2 0.850
## 122 110 visit_2 0.827
## 123 111 visit_2 0.873
## 124 112 visit_2 0.818
## 125 113 visit_2 0.873
## 126 114 visit_2 0.812
## 127 115 visit_2 0.849
## 128 116 visit_2 0.872
## 129 117 visit_2 NA
## 130 118 visit_2 0.769
## 131 119 visit_2 0.759
## 132 120 visit_2 0.824
## 133 121 visit_2 0.835
## 134 122 visit_2 0.902
## 135 123 visit_2 0.857
## 136 124 visit_2 0.829
## 137 125 visit_2 0.833
## 138 126 visit_2 0.792
## 139 127 visit_2 0.828
## 140 128 visit_2 0.748
## 141 129 visit_2 0.849
## 142 130 visit_2 0.829
## 143 131 visit_2 0.862
## 144 132 visit_2 0.785
## 145 133 visit_2 0.892
## 146 134 visit_2 0.778
## 147 135 visit_2 NA
## 148 136 visit_2 0.839
## 149 137 visit_2 0.799
## 150 201 visit_2 0.867
## 151 202 visit_2 0.924
## 152 203 visit_2 0.955
## 153 204 visit_2 0.916
## 154 205 visit_2 0.904
## 155 206 visit_2 0.940
## 156 207 visit_2 0.949
## 157 208 visit_2 0.868
## 158 209 visit_2 0.926
## 159 210 visit_2 0.873
## 160 211 visit_2 0.916
## 161 212 visit_2 1.011
## 162 213 visit_2 0.968
## 163 214 visit_2 0.907
## 164 215 visit_2 0.896
## 165 301 visit_2 0.941
## 166 302 visit_2 0.910
## 167 303 visit_2 0.937
## 168 305 visit_2 0.894
## 169 306 visit_2 NA
## 170 307 visit_2 0.953
## 171 308 visit_2 0.868
## 172 309 visit_2 0.920
## 173 310 visit_2 0.853
## 174 311 visit_2 0.874
## 175 312 visit_2 0.866
## 176 313 visit_2 0.923
## 177 314 visit_2 0.922
## 178 315 visit_2 0.867
## 179 316 visit_2 0.869
## 180 317 visit_2 0.875
## 181 318 visit_2 0.861
## 182 319 visit_2 0.929
## 183 320 visit_2 0.908
## 184 321 visit_2 NA
## 185 322 visit_2 0.853
## 186 323 visit_2 0.885
## 187 324 visit_2 0.823
## 188 325 visit_2 0.839
## 189 326 visit_2 0.876
## 190 327 visit_2 0.851
## 191 328 visit_2 0.870
## 192 329 visit_2 NA
## 193 330 visit_2 NA
## 194 331 visit_2 0.978
## 195 401 visit_2 NA
## 196 402 visit_2 0.904
## 197 403 visit_2 1.010
## 198 404 visit_2 1.049
## 199 405 visit_2 1.051
## 200 406 visit_2 0.981
## 201 407 visit_2 0.987
## 202 408 visit_2 0.930
## 203 409 visit_2 0.968
## 204 410 visit_2 0.902
## 205 411 visit_2 1.004
## 206 412 visit_2 0.944
## 207 413 visit_2 0.952
## 208 414 visit_2 0.934
## 209 415 visit_2 0.979
## 210 416 visit_2 1.055
## 211 417 visit_2 0.980
## 212 418 visit_2 0.977
## 213 419 visit_2 0.914
## 214 420 visit_2 0.967
## 215 421 visit_2 1.024
## 216 422 visit_2 0.892
## 217 423 visit_2 0.870
## 218 424 visit_2 0.903
## 219 425 visit_2 0.959
## 220 426 visit_2 0.942
## 221 427 visit_2 0.910
## 222 428 visit_2 1.037
## 223 429 visit_2 0.973
## 224 430 visit_2 1.011
## 225 101 visit_3 0.911
## 226 102 visit_3 0.855
## 227 103 visit_3 0.843
## 228 104 visit_3 0.885
## 229 105 visit_3 0.952
## 230 106 visit_3 0.890
## 231 107 visit_3 0.817
## 232 108 visit_3 NA
## 233 109 visit_3 0.865
## 234 110 visit_3 0.839
## 235 111 visit_3 0.935
## 236 112 visit_3 0.817
## 237 113 visit_3 0.893
## 238 114 visit_3 0.827
## 239 115 visit_3 0.860
## 240 116 visit_3 NA
## 241 117 visit_3 NA
## 242 118 visit_3 NA
## 243 119 visit_3 0.805
## 244 120 visit_3 0.859
## 245 121 visit_3 0.856
## 246 122 visit_3 0.922
## 247 123 visit_3 0.891
## 248 124 visit_3 0.852
## 249 125 visit_3 0.866
## 250 126 visit_3 0.830
## 251 127 visit_3 0.838
## 252 128 visit_3 0.756
## 253 129 visit_3 0.891
## 254 130 visit_3 0.862
## 255 131 visit_3 0.904
## 256 132 visit_3 0.816
## 257 133 visit_3 0.950
## 258 134 visit_3 0.792
## 259 135 visit_3 NA
## 260 136 visit_3 0.869
## 261 137 visit_3 0.810
## 262 201 visit_3 0.934
## 263 202 visit_3 0.954
## 264 203 visit_3 0.963
## 265 204 visit_3 0.924
## 266 205 visit_3 0.921
## 267 206 visit_3 0.945
## 268 207 visit_3 1.010
## 269 208 visit_3 0.923
## 270 209 visit_3 0.952
## 271 210 visit_3 0.892
## 272 211 visit_3 0.942
## 273 212 visit_3 1.053
## 274 213 visit_3 0.987
## 275 214 visit_3 0.942
## 276 215 visit_3 0.932
## 277 301 visit_3 0.977
## 278 302 visit_3 0.918
## 279 303 visit_3 0.962
## 280 305 visit_3 NA
## 281 306 visit_3 NA
## 282 307 visit_3 0.951
## 283 308 visit_3 NA
## 284 309 visit_3 0.960
## 285 310 visit_3 0.889
## 286 311 visit_3 0.889
## 287 312 visit_3 0.900
## 288 313 visit_3 0.955
## 289 314 visit_3 0.909
## 290 315 visit_3 0.875
## 291 316 visit_3 0.860
## 292 317 visit_3 0.913
## 293 318 visit_3 0.904
## 294 319 visit_3 0.968
## 295 320 visit_3 0.936
## 296 321 visit_3 NA
## 297 322 visit_3 0.882
## 298 323 visit_3 0.922
## 299 324 visit_3 0.829
## 300 325 visit_3 0.859
## 301 326 visit_3 0.908
## 302 327 visit_3 0.873
## 303 328 visit_3 0.884
## 304 329 visit_3 NA
## 305 330 visit_3 NA
## 306 331 visit_3 0.985
## 307 401 visit_3 NA
## 308 402 visit_3 0.963
## 309 403 visit_3 1.041
## 310 404 visit_3 1.038
## 311 405 visit_3 1.080
## 312 406 visit_3 0.991
## 313 407 visit_3 1.023
## 314 408 visit_3 0.955
## 315 409 visit_3 0.973
## 316 410 visit_3 0.915
## 317 411 visit_3 1.052
## 318 412 visit_3 0.981
## 319 413 visit_3 0.981
## 320 414 visit_3 0.965
## 321 415 visit_3 1.028
## 322 416 visit_3 1.067
## 323 417 visit_3 1.036
## 324 418 visit_3 0.996
## 325 419 visit_3 0.933
## 326 420 visit_3 0.994
## 327 421 visit_3 1.065
## 328 422 visit_3 NA
## 329 423 visit_3 0.894
## 330 424 visit_3 0.922
## 331 425 visit_3 0.999
## 332 426 visit_3 0.975
## 333 427 visit_3 0.975
## 334 428 visit_3 1.062
## 335 429 visit_3 0.990
## 336 430 visit_3 1.024
## 337 101 visit_4 0.952
## 338 102 visit_4 0.881
## 339 103 visit_4 0.855
## 340 104 visit_4 0.920
## 341 105 visit_4 0.955
## 342 106 visit_4 0.908
## 343 107 visit_4 0.809
## 344 108 visit_4 NA
## 345 109 visit_4 0.879
## 346 110 visit_4 0.885
## 347 111 visit_4 0.952
## 348 112 visit_4 0.847
## 349 113 visit_4 0.907
## 350 114 visit_4 0.861
## 351 115 visit_4 0.898
## 352 116 visit_4 NA
## 353 117 visit_4 NA
## 354 118 visit_4 NA
## 355 119 visit_4 0.839
## 356 120 visit_4 0.893
## 357 121 visit_4 0.893
## 358 122 visit_4 0.955
## 359 123 visit_4 0.933
## 360 124 visit_4 0.898
## 361 125 visit_4 0.888
## 362 126 visit_4 0.840
## 363 127 visit_4 0.860
## 364 128 visit_4 NA
## 365 129 visit_4 0.924
## 366 130 visit_4 0.896
## 367 131 visit_4 0.914
## 368 132 visit_4 0.830
## 369 133 visit_4 0.982
## 370 134 visit_4 0.822
## 371 135 visit_4 NA
## 372 136 visit_4 0.909
## 373 137 visit_4 0.822
## 374 201 visit_4 0.947
## 375 202 visit_4 0.991
## 376 203 visit_4 0.986
## 377 204 visit_4 0.944
## 378 205 visit_4 0.938
## 379 206 visit_4 0.999
## 380 207 visit_4 1.058
## 381 208 visit_4 0.959
## 382 209 visit_4 NA
## 383 210 visit_4 NA
## 384 211 visit_4 NA
## 385 212 visit_4 1.063
## 386 213 visit_4 1.026
## 387 214 visit_4 0.974
## 388 215 visit_4 0.951
## 389 301 visit_4 0.995
## 390 302 visit_4 0.942
## 391 303 visit_4 0.997
## 392 305 visit_4 NA
## 393 306 visit_4 NA
## 394 307 visit_4 0.992
## 395 308 visit_4 NA
## 396 309 visit_4 0.986
## 397 310 visit_4 0.912
## 398 311 visit_4 0.903
## 399 312 visit_4 0.938
## 400 313 visit_4 1.014
## 401 314 visit_4 0.966
## 402 315 visit_4 0.934
## 403 316 visit_4 0.883
## 404 317 visit_4 0.919
## 405 318 visit_4 0.935
## 406 319 visit_4 0.999
## 407 320 visit_4 0.946
## 408 321 visit_4 NA
## 409 322 visit_4 0.907
## 410 323 visit_4 0.932
## 411 324 visit_4 0.855
## 412 325 visit_4 0.905
## 413 326 visit_4 0.907
## 414 327 visit_4 0.905
## 415 328 visit_4 0.887
## 416 329 visit_4 NA
## 417 330 visit_4 NA
## 418 331 visit_4 1.026
## 419 401 visit_4 NA
## 420 402 visit_4 0.975
## 421 403 visit_4 1.087
## 422 404 visit_4 NA
## 423 405 visit_4 1.114
## 424 406 visit_4 1.002
## 425 407 visit_4 1.050
## 426 408 visit_4 0.972
## 427 409 visit_4 0.987
## 428 410 visit_4 0.923
## 429 411 visit_4 1.092
## 430 412 visit_4 1.005
## 431 413 visit_4 1.009
## 432 414 visit_4 0.971
## 433 415 visit_4 1.046
## 434 416 visit_4 1.096
## 435 417 visit_4 1.044
## 436 418 visit_4 1.016
## 437 419 visit_4 0.945
## 438 420 visit_4 1.038
## 439 421 visit_4 1.113
## 440 422 visit_4 NA
## 441 423 visit_4 0.914
## 442 424 visit_4 0.935
## 443 425 visit_4 1.035
## 444 426 visit_4 1.010
## 445 427 visit_4 1.012
## 446 428 visit_4 1.073
## 447 429 visit_4 1.020
## 448 430 visit_4 1.054
## 449 101 visit_5 0.970
## 450 102 visit_5 0.901
## 451 103 visit_5 0.895
## 452 104 visit_5 0.948
## 453 105 visit_5 1.002
## 454 106 visit_5 0.933
## 455 107 visit_5 0.823
## 456 108 visit_5 NA
## 457 109 visit_5 0.908
## 458 110 visit_5 0.923
## 459 111 visit_5 NA
## 460 112 visit_5 0.862
## 461 113 visit_5 0.934
## 462 114 visit_5 0.889
## 463 115 visit_5 0.913
## 464 116 visit_5 NA
## 465 117 visit_5 NA
## 466 118 visit_5 NA
## 467 119 visit_5 0.852
## 468 120 visit_5 0.921
## 469 121 visit_5 0.929
## 470 122 visit_5 0.972
## 471 123 visit_5 0.970
## 472 124 visit_5 0.924
## 473 125 visit_5 0.920
## 474 126 visit_5 0.863
## 475 127 visit_5 0.932
## 476 128 visit_5 NA
## 477 129 visit_5 0.961
## 478 130 visit_5 0.904
## 479 131 visit_5 0.952
## 480 132 visit_5 0.849
## 481 133 visit_5 0.993
## 482 134 visit_5 0.816
## 483 135 visit_5 NA
## 484 136 visit_5 0.930
## 485 137 visit_5 0.833
## 486 201 visit_5 0.953
## 487 202 visit_5 1.020
## 488 203 visit_5 0.987
## 489 204 visit_5 0.994
## 490 205 visit_5 0.972
## 491 206 visit_5 1.023
## 492 207 visit_5 1.063
## 493 208 visit_5 0.992
## 494 209 visit_5 NA
## 495 210 visit_5 NA
## 496 211 visit_5 NA
## 497 212 visit_5 1.076
## 498 213 visit_5 1.047
## 499 214 visit_5 0.983
## 500 215 visit_5 0.973
## 501 301 visit_5 0.988
## 502 302 visit_5 0.982
## 503 303 visit_5 0.999
## 504 305 visit_5 NA
## 505 306 visit_5 NA
## 506 307 visit_5 0.992
## 507 308 visit_5 NA
## 508 309 visit_5 1.017
## 509 310 visit_5 0.913
## 510 311 visit_5 0.924
## 511 312 visit_5 0.965
## 512 313 visit_5 1.022
## 513 314 visit_5 0.981
## 514 315 visit_5 0.961
## 515 316 visit_5 0.894
## 516 317 visit_5 0.926
## 517 318 visit_5 NA
## 518 319 visit_5 0.999
## 519 320 visit_5 0.950
## 520 321 visit_5 NA
## 521 322 visit_5 0.912
## 522 323 visit_5 0.971
## 523 324 visit_5 0.868
## 524 325 visit_5 0.946
## 525 326 visit_5 0.922
## 526 327 visit_5 0.912
## 527 328 visit_5 0.931
## 528 329 visit_5 NA
## 529 330 visit_5 NA
## 530 331 visit_5 1.057
## 531 401 visit_5 NA
## 532 402 visit_5 0.984
## 533 403 visit_5 1.120
## 534 404 visit_5 NA
## 535 405 visit_5 1.104
## 536 406 visit_5 1.011
## 537 407 visit_5 1.053
## 538 408 visit_5 0.988
## 539 409 visit_5 0.994
## 540 410 visit_5 0.952
## 541 411 visit_5 1.084
## 542 412 visit_5 1.005
## 543 413 visit_5 1.022
## 544 414 visit_5 0.980
## 545 415 visit_5 1.068
## 546 416 visit_5 1.119
## 547 417 visit_5 1.112
## 548 418 visit_5 1.012
## 549 419 visit_5 NA
## 550 420 visit_5 1.022
## 551 421 visit_5 1.126
## 552 422 visit_5 NA
## 553 423 visit_5 0.927
## 554 424 visit_5 0.988
## 555 425 visit_5 1.066
## 556 426 visit_5 1.014
## 557 427 visit_5 1.023
## 558 428 visit_5 1.083
## 559 429 visit_5 1.051
## 560 430 visit_5 1.071
# Calculate descriptive statistics
# For each visit-by-group cell: mean, median, min, max, and SD of bone
# mineral density (bmd), plus the number of missing bmd values and cell size.
# na.rm = TRUE is needed because bmd contains NA values (see listing above).
calcium %>%
group_by(visit, group) %>%
summarize(avg_bmd = mean(bmd, na.rm = TRUE), median_bmd = median(bmd, na.rm = TRUE),
minimum_bmd = min(bmd, na.rm = TRUE), maximum_bmd = max(bmd, na.rm = TRUE),
standev_bmd = sd(bmd, na.rm = TRUE), num_miss = sum(is.na(bmd)), n = n()
)
## # A tibble: 10 x 9
## # Groups: visit [5]
## visit group avg_bmd median_bmd minimum_bmd maximum_bmd standev_bmd num_miss
## <int> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 1 C 0.880 0.879 0.762 1.03 0.0597 0
## 2 1 P 0.870 0.861 0.746 1.01 0.0658 0
## 3 2 C 0.903 0.905 0.769 1.05 0.0593 0
## 4 2 P 0.890 0.869 0.748 1.06 0.0751 0
## 5 3 C 0.938 0.938 0.816 1.08 0.0588 0
## 6 3 P 0.914 0.904 0.756 1.07 0.0780 0
## 7 4 C 0.964 0.955 0.83 1.11 0.0651 0
## 8 4 P 0.942 0.925 0.809 1.10 0.0753 0
## 9 5 C 0.988 0.986 0.849 1.13 0.0629 0
## 10 5 P 0.958 0.95 0.816 1.12 0.0736 0
## # ... with 1 more variable: n <int>
# Violin plots of bone mineral density at each visit, filled by treatment group,
# to compare the outcome distributions over time
ggplot(data = calcium, mapping = aes(x = factor(visit), y = bmd)) +
  geom_violin(mapping = aes(fill = group)) +
  labs(x = "Visit Number", y = "Bone Mineral Density") +
  theme_bw(base_size = 16)
Chapter 2 - Modeling Continuous Longitudinal Outcomes
Longitudinal Analysis for Continuous Outcomes:
Addition of Random Slope Terms:
Visualize and Interpret Output:
The model-implied correlation structure of the repeated measures can be computed from `VarCorr(object)` and the model's random-effects design matrix; the helper function `corr_structure()` that implements this (for both intercept-only and intercept-plus-slope models) is defined in full in the example code below. Example code includes:
# Spaghetti plot: one faint line per person across visits, with an overall
# smoothed trend line on top
ggplot(data = calcium, mapping = aes(x = visit, y = bmd)) +
  geom_line(mapping = aes(group = person), alpha = 0.4) +
  geom_smooth(se = FALSE, size = 2) +
  labs(x = 'Visit Number', y = 'Bone Mineral Density (g/cm^2)') +
  theme_bw(base_size = 14)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Unconditional model
# Random-intercept growth model: fixed intercept plus a fixed linear effect
# of visit, with a person-specific random intercept (1 | person).
uncond_model <- lme4::lmer(bmd ~ 1 + visit + (1 | person), data = calcium)
# Show model output
summary(uncond_model)
## Linear mixed model fit by REML ['lmerMod']
## Formula: bmd ~ 1 + visit + (1 | person)
## Data: calcium
##
## REML criterion at convergence: -2232.4
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.7257 -0.5815 0.0097 0.5980 3.0246
##
## Random effects:
## Groups Name Variance Std.Dev.
## person (Intercept) 0.0044907 0.06701
## Residual 0.0002475 0.01573
## Number of obs: 501, groups: person, 112
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 0.8503372 0.0065326 130.2
## visit 0.0247851 0.0005131 48.3
##
## Correlation of Fixed Effects:
## (Intr)
## visit -0.215
# Alter the visit variable to start at 0
# so the fixed intercept is interpretable as the expected bmd at the first
# visit (visit_0 = 0) rather than at a nonexistent visit 0
calcium <- calcium %>%
mutate(visit_0 = visit - 1)
# Fit random intercept model with new time variable
# (only the intercept estimate changes relative to uncond_model; the slope
# and variance components are unchanged, as the output below shows)
uncond_model_0 <- lme4::lmer(bmd ~ 1 + visit_0 + (1 | person), data = calcium)
summary(uncond_model_0)
## Linear mixed model fit by REML ['lmerMod']
## Formula: bmd ~ 1 + visit_0 + (1 | person)
## Data: calcium
##
## REML criterion at convergence: -2232.4
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.7257 -0.5815 0.0097 0.5980 3.0246
##
## Random effects:
## Groups Name Variance Std.Dev.
## person (Intercept) 0.0044907 0.06701
## Residual 0.0002475 0.01573
## Number of obs: 501, groups: person, 112
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 0.8751223 0.0064416 135.9
## visit_0 0.0247851 0.0005131 48.3
##
## Correlation of Fixed Effects:
## (Intr)
## visit_0 -0.139
# Random slope
# Adds a person-specific slope for visit_0, correlated with the random
# intercept, so each person may have their own rate of change
uncond_model_rs <- lme4::lmer(bmd ~ 1 + visit_0 + (1 + visit_0 | person), data = calcium)
summary(uncond_model_rs)
## Linear mixed model fit by REML ['lmerMod']
## Formula: bmd ~ 1 + visit_0 + (1 + visit_0 | person)
## Data: calcium
##
## REML criterion at convergence: -2351.1
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.2961 -0.5334 -0.0050 0.5035 2.2933
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## person (Intercept) 4.152e-03 0.064438
## visit_0 5.239e-05 0.007238 0.14
## Residual 1.247e-04 0.011167
## Number of obs: 501, groups: person, 112
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 0.8752292 0.0061471 142.4
## visit_0 0.0245982 0.0008146 30.2
##
## Correlation of Fixed Effects:
## (Intr)
## visit_0 0.066
# Compare random slopes and random intercept only models
# anova() refits both models with ML (note the message in the output) and
# performs a likelihood-ratio chi-square test of the extra variance terms
anova(uncond_model_rs, uncond_model_0)
## refitting model(s) with ML (instead of REML)
## Data: calcium
## Models:
## uncond_model_0: bmd ~ 1 + visit_0 + (1 | person)
## uncond_model_rs: bmd ~ 1 + visit_0 + (1 + visit_0 | person)
## Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## uncond_model_0 4 -2246.0 -2229.2 1127.0 -2254.0
## uncond_model_rs 6 -2359.8 -2334.5 1185.9 -2371.8 117.79 2 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Model-implied individual trajectories: random-intercept-only model.
# predict() with no new data returns person-specific fitted values.
calcium_vis <- mutate(calcium, pred_values_ri = predict(uncond_model_0))
# Parallel lines: each person has their own intercept but a common slope
ggplot(calcium_vis, aes(x = visit_0, y = pred_values_ri)) +
  geom_line(aes(group = person), size = 1, color = 'gray70') +
  labs(x = "Visit Number",
       y = "Model Predicted Bone Mineral Density (g/cm^2)") +
  theme_bw()
# Model-implied individual trajectories: random intercept and slope model
calcium_vis <- mutate(calcium, pred_values_rs = predict(uncond_model_rs))
# Non-parallel lines: each person has their own intercept and slope
ggplot(calcium_vis, aes(x = visit_0, y = pred_values_rs)) +
  geom_line(aes(group = person), size = 1, color = 'gray70') +
  labs(x = "Visit Number",
       y = "Model Predicted Bone Mineral Density (g/cm^2)") +
  theme_bw()
# Compute the model-implied correlation matrix of the repeated measures
# from a fitted lme4 model.
#
# object         - a fitted lmerMod object
# num_timepoints - number of repeated measures per person (dimension of the
#                  returned square matrix)
# intercept_only - TRUE for a random-intercept-only model; FALSE when the
#                  model also has a random slope (then the first two columns
#                  of the design matrix and the 2x2 random-effects covariance
#                  block are used)
#
# NOTE(review): this reaches into lme4 internals (object@pp$X) for the
# fixed-effects design matrix and assumes its first num_timepoints rows
# belong to a single person with complete data -- verify this assumption
# holds for unbalanced designs before reusing.
corr_structure <- function(object, num_timepoints, intercept_only = TRUE) {
# Random-effects (co)variances; attr(variance, "sc") is the residual SD
variance <- lme4::VarCorr(object)
if(intercept_only) {
# Z is a column of ones; marginal covariance = Z * tau^2 * Z' + sigma^2 * I
random_matrix <- as.matrix(object@pp$X[1:num_timepoints, 1])
var_cor <- random_matrix %*% variance[[1]][1] %*% t(random_matrix) + diag(attr(variance, "sc")^2, nrow = num_timepoints, ncol = num_timepoints)
} else {
# Z is [1, time]; marginal covariance = Z * T * Z' + sigma^2 * I, where T is
# the 2x2 covariance matrix of the random intercept and slope
random_matrix <- as.matrix(object@pp$X[1:num_timepoints, ])
var_cor <- random_matrix %*% variance[[1]][1:2, 1:2] %*% t(random_matrix) + diag(attr(variance, "sc")^2, nrow = num_timepoints, ncol = num_timepoints)
}
# Convert the marginal covariance matrix to a correlation matrix
Matrix::cov2cor(var_cor)
}
# Random intercept and slope model
random_slope <- lme4::lmer(bmd ~ 1 + visit_0 + (1 + visit_0 | person), data = calcium)
# Generate model implied correlation matrix
# intercept_only = FALSE because the model includes a random slope;
# num_timepoints = 5 matches the five visits in the calcium data
mod_corr <- corr_structure(random_slope, num_timepoints = 5, intercept_only = FALSE)
round(mod_corr, 3)
## 1 2 3 4 5
## 1 1.000 0.966 0.950 0.927 0.899
## 2 0.966 1.000 0.968 0.955 0.935
## 3 0.950 0.968 1.000 0.970 0.959
## 4 0.927 0.955 0.970 1.000 0.973
## 5 0.899 0.935 0.959 0.973 1.000
# Create visualization for correlation structure
# GGally::ggcorr can plot a precomputed matrix via cor_matrix (data = NULL);
# midpoint/limits = NULL let the color scale adapt to the observed range
GGally::ggcorr(data = NULL, cor_matrix = mod_corr, midpoint = NULL,
limits = NULL, label = TRUE, label_round = 3, label_size = 5,
nbreaks = 100, palette = 'PuBuGn'
)
## Color gradient midpoint set at median correlation to 0.96
Chapter 3 - Add Fixed Predictor Variables
Adding Predictors:
Adding Predictors - Interactions:
Model Comparisons and Explained Variance:
Example code includes:
# Add a categorical predictor
# group is a two-level factor (C/P); the model estimates a fixed intercept
# shift for group P (coefficient groupP in the output below)
bmd_group <- lme4::lmer(bmd ~ 1 + visit_0 + group + (1 + visit_0 | person), data = calcium)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, :
## Model failed to converge with max|grad| = 0.00541016 (tol = 0.002, component 1)
# Summarize the group-predictor model (note the convergence warning above)
summary(bmd_group)
## Linear mixed model fit by REML ['lmerMod']
## Formula: bmd ~ 1 + visit_0 + group + (1 + visit_0 | person)
## Data: calcium
##
## REML criterion at convergence: -2344.7
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.30713 -0.53193 -0.00697 0.50756 2.29630
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## person (Intercept) 4.158e-03 0.064481
## visit_0 5.233e-05 0.007234 0.11
## Residual 1.247e-04 0.011169
## Number of obs: 501, groups: person, 112
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 0.8803219 0.0087733 100.341
## visit_0 0.0245961 0.0008144 30.201
## groupP -0.0100098 0.0122917 -0.814
##
## Correlation of Fixed Effects:
## (Intr) vist_0
## visit_0 0.031
## groupP -0.713 0.001
## convergence code: 0
## Model failed to converge with max|grad| = 0.00541016 (tol = 0.002, component 1)
# Add a continuous predictor
# age enters as a fixed main effect alongside visit_0 and group
bmd_group_age <- lme4::lmer(bmd ~ 1 + visit_0 + group + age + (1 + visit_0 | person), data = calcium)
summary(bmd_group_age)
## Linear mixed model fit by REML ['lmerMod']
## Formula: bmd ~ 1 + visit_0 + group + age + (1 + visit_0 | person)
## Data: calcium
##
## REML criterion at convergence: -2343.8
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.30240 -0.49803 -0.01591 0.51495 2.38624
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## person (Intercept) 0.0042073 0.06486
## visit_0 0.0000490 0.00700 0.12
## Residual 0.0001244 0.01115
## Number of obs: 501, groups: person, 112
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 0.507361 0.153645 3.302
## visit_0 0.007942 0.006901 1.151
## groupP -0.009896 0.012361 -0.801
## age 0.033701 0.013864 2.431
##
## Correlation of Fixed Effects:
## (Intr) vist_0 groupP
## visit_0 0.992
## groupP -0.039 0.002
## age -0.998 -0.993 -0.002
# Calculate aggregate trends
# re.form = NA gives population-level predictions (fixed effects only,
# ignoring the random effects), which are then averaged within each
# visit-by-group cell
calcium_agg <- calcium %>%
mutate(pred_values = predict(bmd_group_age, re.form = NA)) %>%
group_by(visit_0, group) %>%
summarize(pred_group = mean(pred_values))
# Overlay model-predicted group trends (lines) on the raw observations
# (points); points are drawn first so the trend lines sit on top
ggplot(calcium_agg, aes(x = visit_0, y = pred_group, color = group)) +
  geom_point(data = calcium, mapping = aes(x = visit_0, y = bmd, color = group)) +
  geom_line(size = 1.25) +
  labs(x = 'Visit Number',
       y = 'Model Predicted Bone Mineral Density (g/cm^2)')
# Add an interaction
# visit_0:group tests whether the growth rate differs by group;
# visit_0:age tests whether the growth rate varies with age
bmd_group_age_int <- lme4::lmer(bmd ~ 1 + visit_0 + age + group + visit_0:group + visit_0:age + (1 + visit_0 | person), data = calcium)
summary(bmd_group_age_int)
## Linear mixed model fit by REML ['lmerMod']
## Formula: bmd ~ 1 + visit_0 + age + group + visit_0:group + visit_0:age +
## (1 + visit_0 | person)
## Data: calcium
##
## REML criterion at convergence: -2331.6
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.22416 -0.52727 -0.01329 0.52812 2.29995
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## person (Intercept) 4.219e-03 0.064953
## visit_0 4.465e-05 0.006682 0.12
## Residual 1.231e-04 0.011096
## Number of obs: 501, groups: person, 112
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 0.4582089 0.1540324 2.975
## visit_0 0.0235327 0.0097236 2.420
## age 0.0381262 0.0138911 2.745
## groupP -0.0116086 0.0123912 -0.937
## visit_0:groupP -0.0044210 0.0015302 -2.889
## visit_0:age -0.0011939 0.0006104 -1.956
##
## Correlation of Fixed Effects:
## (Intr) vist_0 age groupP vs_0:P
## visit_0 0.579
## age -0.998 -0.582
## groupP -0.039 -0.003 -0.002
## vist_0:grpP -0.018 -0.088 0.017 0.049
## visit_0:age 0.157 -0.706 -0.154 0.000 -0.005
# Add residuals original data
calcium <- calcium %>%
mutate(model_residuals = residuals(bmd_group_age_int))
# Visualize residuals
# one density curve per group, to eyeball approximate normality and any
# group differences in the residual distribution
ggplot(calcium, aes(x = model_residuals)) +
geom_density(aes(color = group), size = 1.25) +
theme_bw(base_size = 14) +
xlab("Model Residuals")
# Extract random effects
# Person-level random intercepts and slopes, reshaped to long format so
# each effect can be QQ-plotted in its own facet below.
# - seq_len(n()) is safe for zero rows (1:n() would yield c(1, 0))
# - pivot_longer() replaces the superseded gather(); tidyr >= 1.0 is loaded
#   at the top of this document
random_effects <- lme4::ranef(bmd_group_age_int)$person %>%
mutate(id = seq_len(n())) %>%
pivot_longer(cols = -id, names_to = "variable", values_to = "value")
# Normal QQ plots of the estimated random effects, one panel per effect,
# to check the normality assumption on the random intercepts and slopes
ggplot(random_effects, mapping = aes(sample = value)) +
  geom_qq() +
  geom_qq_line() +
  facet_wrap(vars(variable), scales = 'free_y') +
  theme_bw(base_size = 14)
# Compare random slope, model with group variable, model with group and age, and model with interactions.
# AICc table across the four candidate models; all were fit with REML, so
# comparing fixed effects triggers the warning shown in the output below
AICcmodavg::aictab(list(uncond_model_rs, bmd_group, bmd_group_age, bmd_group_age_int),
modnames = c('random slope', 'group intercept', 'group and age',
'group and age interaction'
)
)
## Warning in aictab.AIClmerMod(list(uncond_model_rs, bmd_group, bmd_group_age, :
## Model selection for fixed effects is only appropriate with ML estimation:
## REML (default) should only be used to select random effects for a constant set of fixed effects
##
## Model selection based on AICc:
##
## K AICc Delta_AICc AICcWt Cum.Wt Res.LL
## random slope 6 -2338.91 0.00 0.98 0.98 1175.54
## group intercept 7 -2330.51 8.40 0.01 1.00 1172.37
## group and age 8 -2327.48 11.42 0.00 1.00 1171.89
## group and age interaction 10 -2311.18 27.73 0.00 1.00 1165.81
# Compute explained variance for random slope only model
# R2m = marginal R^2 (fixed effects only); R2c = conditional R^2
# (fixed plus random effects)
MuMIn::r.squaredGLMM(uncond_model_rs)
## Warning: 'r.squaredGLMM' now calculates a revised statistic. See the help page.
## R2m R2c
## [1,] 0.2016978 0.9793256
# Compute explained variance for group and age predicting intercepts model
MuMIn::r.squaredGLMM(bmd_group_age)
## R2m R2c
## [1,] 0.206134 0.9794822
# Compute explained variance for interaction model
MuMIn::r.squaredGLMM(bmd_group_age_int)
## R2m R2c
## [1,] 0.2177605 0.9799048
Chapter 4 - Modeling Longitudinal Dichotomous Outcomes
Exploring and Modeling Dichotomous Outcomes:
Generalized Estimating Functions (GEE):
Model Selection:
Interpreting and Visualizing Model Results:
Example code includes:
# Construct the madras symptom data: 86 subjects with 12 monthly
# observations each (ids 83, 84, 88, and 89 do not appear in the sequence)
ids <- rep(c(1:82, 85:87, 90), each=12)
# Month index 0-11 within each subject
months <- rep(0:11, times=length(unique(ids)))
# Monthly symptom indicator (1 = present, 0 = absent, NA = missing),
# entered in two halves to keep each literal manageable
symps <- c(1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, NA, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, NA, NA, NA, NA, NA, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, NA, NA, NA, NA, NA, NA, NA, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, NA, NA, NA, NA, NA, NA, NA, NA, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, NA, NA, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0)
# Second half of the symptom vector, appended to the first
symps <- c(symps, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, NA, NA, NA, NA, NA, NA, NA, NA, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, NA, NA, NA, NA, NA, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, NA, NA, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, NA, NA, NA, NA, 0, 1, 1, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, NA, NA, 1, 1, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, NA, NA, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0)
# Subject-level age group, repeated across each subject's 12 rows
age <- rep(c('Less than 20', 'Less than 20', 'Less than 20', '20 or Older', 'Less than 20', '20 or Older', 'Less than 20', 'Less than 20', '20 or Older', 'Less than 20', '20 or Older', '20 or Older', 'Less than 20', 'Less than 20', 'Less than 20', 'Less than 20', '20 or Older', '20 or Older', 'Less than 20', 'Less than 20', 'Less than 20', 'Less than 20', 'Less than 20', '20 or Older', '20 or Older', 'Less than 20', '20 or Older', 'Less than 20', '20 or Older', 'Less than 20', '20 or Older', 'Less than 20', 'Less than 20', 'Less than 20', '20 or Older', '20 or Older', 'Less than 20', 'Less than 20', 'Less than 20', '20 or Older', '20 or Older', '20 or Older', 'Less than 20', 'Less than 20', 'Less than 20', 'Less than 20', '20 or Older', 'Less than 20', 'Less than 20', 'Less than 20', '20 or Older', 'Less than 20', 'Less than 20', 'Less than 20', 'Less than 20', '20 or Older', 'Less than 20', 'Less than 20', 'Less than 20', '20 or Older', 'Less than 20', '20 or Older', 'Less than 20', '20 or Older', '20 or Older', 'Less than 20', 'Less than 20', 'Less than 20', 'Less than 20', 'Less than 20', 'Less than 20', 'Less than 20', '20 or Older', 'Less than 20', '20 or Older', 'Less than 20', '20 or Older', '20 or Older', 'Less than 20', 'Less than 20', '20 or Older', 'Less than 20', '20 or Older', 'Less than 20', 'Less than 20', 'Less than 20'), each=12)
# Subject-level sex, repeated across each subject's 12 rows
sex <- rep(c('Male', 'Female', 'Female', 'Female', 'Female', 'Male', 'Female', 'Female', 'Female', 'Male', 'Female', 'Female', 'Male', 'Male', 'Male', 'Male', 'Female', 'Female', 'Female', 'Female', 'Female', 'Male', 'Male', 'Female', 'Male', 'Female', 'Male', 'Male', 'Female', 'Female', 'Male', 'Female', 'Female', 'Male', 'Female', 'Male', 'Female', 'Female', 'Female', 'Female', 'Female', 'Female', 'Male', 'Male', 'Male', 'Male', 'Female', 'Male', 'Female', 'Male', 'Male', 'Male', 'Male', 'Female', 'Female', 'Male', 'Male', 'Female', 'Male', 'Female', 'Male', 'Male', 'Female', 'Female', 'Male', 'Male', 'Male', 'Male', 'Female', 'Female', 'Male', 'Male', 'Female', 'Female', 'Male', 'Male', 'Male', 'Female', 'Male', 'Male', 'Female', 'Female', 'Male', 'Male', 'Male', 'Male'), each=12)
# Assemble into a long-format data frame: one row per subject-month
madras <- data.frame(id=ids, symptom=symps, month=months, age=age, sex=sex)
# Explore the first few rows of the madras data
# str() confirms 1032 rows (86 subjects x 12 months) and the factor codings
str(madras)
## 'data.frame': 1032 obs. of 5 variables:
## $ id : num 1 1 1 1 1 1 1 1 1 1 ...
## $ symptom: num 1 1 1 1 1 0 0 0 0 0 ...
## $ month : int 0 1 2 3 4 5 6 7 8 9 ...
## $ age : Factor w/ 2 levels "20 or Older",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ sex : Factor w/ 2 levels "Female","Male": 2 2 2 2 2 2 2 2 2 2 ...
# Peek at the first six rows (subject 1, months 0-5)
head(madras)
## id symptom month age sex
## 1 1 1 0 Less than 20 Male
## 2 1 1 1 Less than 20 Male
## 3 1 1 2 Less than 20 Male
## 4 1 1 3 Less than 20 Male
## 5 1 1 4 Less than 20 Male
## 6 1 0 5 Less than 20 Male
# Descriptives about symptom prevalence over time
# Per month: count of subjects with symptoms, number of subjects, and the
# proportion symptomatic (NA responses excluded via na.rm = TRUE)
summary_stats <- madras %>%
group_by(month) %>%
summarize(num_symptom = sum(symptom, na.rm = TRUE),
num = n(),
prop_symptom = mean(symptom, na.rm = TRUE)
)
# Print out summary statistics
summary_stats
## # A tibble: 12 x 4
## month num_symptom num prop_symptom
## <int> <dbl> <int> <dbl>
## 1 0 56 86 0.651
## 2 1 51 86 0.6
## 3 2 43 86 0.518
## 4 3 37 86 0.462
## 5 4 28 86 0.359
## 6 5 20 86 0.263
## 7 6 13 86 0.171
## 8 7 10 86 0.133
## 9 8 7 86 0.0972
## 10 9 8 86 0.111
## 11 10 6 86 0.0857
## 12 11 6 86 0.0870
# Build models
# Unconditional logistic growth model: the log-odds of a symptom decline
# linearly in month, with a person-specific random intercept
uncond_ri <- lme4::glmer(symptom ~ 1 + month + (1|id), data = madras, family = binomial)
summary(uncond_ri)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: symptom ~ 1 + month + (1 | id)
## Data: madras
##
## AIC BIC logLik deviance df.resid
## 764.0 778.5 -379.0 758.0 919
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -6.7679 -0.3539 -0.1316 0.3253 6.3572
##
## Random effects:
## Groups Name Variance Std.Dev.
## id (Intercept) 4.961 2.227
## Number of obs: 922, groups: id, 86
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.07446 0.30866 3.481 0.000499 ***
## month -0.54049 0.04451 -12.143 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## month -0.457
# Add in covariates based on trend plot
# Fixed effects for sex and age group, their interaction, and a
# sex-by-month interaction allowing the decline rate to differ by sex
cond_model <- lme4::glmer(symptom ~ 1 + month + sex + age + sex:age + sex:month + (1 | id), data = madras, family = binomial)
# Generate summary of output
summary(cond_model)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: symptom ~ 1 + month + sex + age + sex:age + sex:month + (1 | id)
## Data: madras
##
## AIC BIC logLik deviance df.resid
## 760.2 794.0 -373.1 746.2 915
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -5.7862 -0.3770 -0.1273 0.3201 9.4814
##
## Random effects:
## Groups Name Variance Std.Dev.
## id (Intercept) 4.305 2.075
## Number of obs: 922, groups: id, 86
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.60467 0.60890 2.635 0.0084 **
## month -0.63718 0.07406 -8.604 <2e-16 ***
## sexMale -0.55037 0.92853 -0.593 0.5534
## ageLess than 20 -1.49168 0.75966 -1.964 0.0496 *
## sexMale:ageLess than 20 1.98705 1.09623 1.813 0.0699 .
## month:sexMale 0.15234 0.08694 1.752 0.0797 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) month sexMal agLt20 sM:Lt2
## month -0.415
## sexMale -0.655 0.270
## ageLssthn20 -0.707 0.120 0.462
## sxMl:gLst20 0.492 -0.093 -0.762 -0.697
## month:sexMl 0.342 -0.800 -0.352 -0.080 0.057
# Fit a GEE model with intercept and time variable
# The default working correlation structure is independence;
# scale.fix = TRUE holds the scale parameter fixed
gee_mod <- geepack::geeglm(symptom ~ 1 + month, id = id, family=binomial, data = madras, scale.fix = TRUE)
# Extract model results
summary(gee_mod)
##
## Call:
## geepack::geeglm(formula = symptom ~ 1 + month, family = binomial,
## data = madras, id = id, scale.fix = TRUE)
##
## Coefficients:
## Estimate Std.err Wald Pr(>|W|)
## (Intercept) 0.67587 0.21077 10.28 0.00134 **
## month -0.32535 0.04452 53.42 2.7e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation structure = independence
## Scale is fixed.
##
## Number of clusters: 86 Maximum cluster size: 12
# Fit a GEE model with an ar(1) working correlation matrix
# (correlation decays geometrically with the gap between months)
gee_mod_ar1 <- geepack::geeglm(symptom ~ 1 + month, id = id, family = binomial, data = madras, corstr="ar1", scale.fix = TRUE)
# Fit a GEE model with an unstructured working correlation matrix
# (a separate correlation parameter for every pair of months)
gee_mod_un <- geepack::geeglm(symptom ~ 1 + month, id = id, family = binomial, data = madras, corstr="unstructured", scale.fix = TRUE)
# Extract model results
summary(gee_mod_ar1)
##
## Call:
## geepack::geeglm(formula = symptom ~ 1 + month, family = binomial,
## data = madras, id = id, corstr = "ar1", scale.fix = TRUE)
##
## Coefficients:
## Estimate Std.err Wald Pr(>|W|)
## (Intercept) 0.634 0.199 10.1 0.0015 **
## month -0.308 0.041 56.6 5.3e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation structure = ar1
## Scale is fixed.
##
## Link = identity
##
## Estimated Correlation Parameters:
## Estimate Std.err
## alpha 0.678 0.0451
## Number of clusters: 86 Maximum cluster size: 12
# Extract model results for the unstructured-correlation GEE
summary(gee_mod_un)
##
## Call:
## geepack::geeglm(formula = symptom ~ 1 + month, family = binomial,
## data = madras, id = id, corstr = "unstructured", scale.fix = TRUE)
##
## Coefficients:
## Estimate Std.err Wald Pr(>|W|)
## (Intercept) -1.607 0.950 2.86 0.091 .
## month 1.008 0.578 3.03 0.082 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation structure = unstructured
## Scale is fixed.
##
## Link = identity
##
## Estimated Correlation Parameters:
## Estimate Std.err
## alpha.1:2 1.341 1.03e+00
## alpha.1:3 0.403 5.58e-01
## alpha.1:4 -0.497 5.61e-01
## alpha.1:5 -1.882 6.12e-01
## alpha.1:6 -4.397 3.73e+00
## alpha.1:7 -9.542 1.26e+01
## alpha.1:8 -16.387 2.79e+01
## alpha.1:9 -28.267 5.83e+01
## alpha.1:10 -45.536 1.09e+02
## alpha.1:11 -77.518 2.10e+02
## alpha.1:12 -126.572 3.83e+02
## alpha.2:3 0.547 1.52e-01
## alpha.2:4 0.248 4.80e-01
## alpha.2:5 -0.446 6.41e-01
## alpha.2:6 -1.154 6.25e-01
## alpha.2:7 -2.836 1.89e+00
## alpha.2:8 -6.135 7.38e+00
## alpha.2:9 -12.003 1.95e+01
## alpha.2:10 -19.110 3.80e+01
## alpha.2:11 -30.538 7.06e+01
## alpha.2:12 -49.110 1.29e+02
## alpha.3:4 0.962 5.84e-01
## alpha.3:5 1.070 1.44e+00
## alpha.3:6 1.411 2.71e+00
## alpha.3:7 1.397 3.87e+00
## alpha.3:8 2.825 7.08e+00
## alpha.3:9 4.176 1.12e+01
## alpha.3:10 6.223 1.71e+01
## alpha.3:11 8.645 2.56e+01
## alpha.3:12 15.820 4.76e+01
## alpha.4:5 2.990 3.42e+00
## alpha.4:6 4.473 6.70e+00
## alpha.4:7 6.695 1.20e+01
## alpha.4:8 12.036 2.41e+01
## alpha.4:9 19.898 4.44e+01
## alpha.4:10 31.794 7.92e+01
## alpha.4:11 54.121 1.48e+02
## alpha.4:12 91.655 2.75e+02
## alpha.5:6 10.186 1.73e+01
## alpha.5:7 15.764 3.14e+01
## alpha.5:8 27.066 6.09e+01
## alpha.5:9 46.677 1.17e+02
## alpha.5:10 77.419 2.15e+02
## alpha.5:11 131.788 4.02e+02
## alpha.5:12 215.885 7.19e+02
## alpha.6:7 34.951 7.97e+01
## alpha.6:8 57.428 1.47e+02
## alpha.6:9 95.123 2.70e+02
## alpha.6:10 157.530 4.90e+02
## alpha.6:11 262.681 8.91e+02
## alpha.6:12 432.122 1.59e+03
## alpha.7:8 106.445 3.04e+02
## alpha.7:9 170.915 5.38e+02
## alpha.7:10 282.934 9.69e+02
## alpha.7:11 481.633 1.78e+03
## alpha.7:12 793.897 3.17e+03
## alpha.8:9 314.324 1.08e+03
## alpha.8:10 503.088 1.87e+03
## alpha.8:11 841.802 3.38e+03
## alpha.8:12 1388.948 5.97e+03
## alpha.9:10 861.042 3.46e+03
## alpha.9:11 1465.730 6.30e+03
## alpha.9:12 2420.508 1.11e+04
## alpha.10:11 2465.610 1.13e+04
## alpha.10:12 4072.802 1.99e+04
## alpha.11:12 6961.365 3.59e+04
## Number of clusters: 86 Maximum cluster size: 12
# Fit a GEE model with an ar(1) working correlation matrix.
# NOTE: this overwrites the earlier `gee_mod_ar1`, now adding age, sex,
# and all two-way interactions among month, age, and sex.
gee_mod_ar1 <- geepack::geeglm(symptom ~ 1 + month + age + sex + age:sex + month:age + month:sex, id = id, data = madras, family = binomial, corstr = 'ar1', scale.fix = TRUE)
# Extract model results
summary(gee_mod_ar1)
##
## Call:
## geepack::geeglm(formula = symptom ~ 1 + month + age + sex + age:sex +
## month:age + month:sex, family = binomial, data = madras,
## id = id, corstr = "ar1", scale.fix = TRUE)
##
## Coefficients:
## Estimate Std.err Wald Pr(>|W|)
## (Intercept) 1.3287 0.4784 7.71 0.0055 **
## month -0.4819 0.0792 37.07 1.1e-09 ***
## ageLess than 20 -1.0874 0.5472 3.95 0.0469 *
## sexMale -0.6198 0.6748 0.84 0.3583
## ageLess than 20:sexMale 1.0846 0.7136 2.31 0.1285
## month:ageLess than 20 0.0637 0.0895 0.51 0.4766
## month:sexMale 0.1763 0.0980 3.24 0.0720 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation structure = ar1
## Scale is fixed.
##
## Link = identity
##
## Estimated Correlation Parameters:
## Estimate Std.err
## alpha 0.624 0.0413
## Number of clusters: 86 Maximum cluster size: 12
# Fit a GEE model with an exchangeable working correlation matrix
# (constant correlation between any two observations within a subject)
gee_mod_exch <- geepack::geeglm(symptom ~ 1 + month + age + sex, data = madras, id = id, family = binomial, scale.fix = TRUE, corstr = 'exchangeable')
# Fit the analogous GLMM: logistic regression with a random intercept
# per subject, so its predictions are subject-specific (conditional)
glmm_age <- lme4::glmer(symptom ~ 1 + month + age + month:age + (1 | id), data=madras, family=binomial)
# Attach model-implied (subject-specific) probabilities from the GLMM.
# Incomplete rows are dropped first so the fitted values from predict()
# line up with the remaining rows.
madras <- mutate(na.omit(madras), prob = predict(glmm_age, type = "response"))
# Spaghetti plot: one probability trajectory per subject over time
ggplot(madras, aes(x = month, y = prob)) +
  geom_line(aes(group = id)) +
  labs(x = "Month", y = "Probabilities") +
  theme_bw()
# Average the subject-specific probabilities within each month/age cell
glmm_prob <- madras %>%
  group_by(month, age) %>%
  summarise(prob = mean(prob))
# Plot the averaged trajectories, one colored line per age group
ggplot(glmm_prob, aes(x = month, y = prob)) +
  geom_line(aes(color = age)) +
  labs(x = "Month", y = "Probability") +
  theme_bw()
# GEE with a month-by-age interaction plus sex, AR(1) working correlation,
# scale parameter fixed at 1
gee_age_sex <- geepack::geeglm(
  symptom ~ 1 + month + age + month:age + sex,
  data = madras, id = id, family = binomial,
  corstr = "ar1", scale.fix = TRUE
)
# Compute model-implied probabilities using gee_age_sex.
# Restrict to the model's columns and drop incomplete rows so the
# fitted values from predict() align with the remaining rows.
madras_gee <- mutate(
  na.omit(select(madras, month, symptom, age, sex)),
  prob = predict(gee_age_sex, type = "response")
)
# Trajectories colored by age group, one panel per sex
ggplot(madras_gee, aes(x = month, y = prob)) +
  geom_line(aes(color = age)) +
  facet_wrap(~ sex) +
  labs(x = "Month", y = "Probability") +
  theme_bw()
# Fit a GLMM model: random intercept per subject, for comparison with
# the population-average GEE fit above
glmm_age_sex <- lme4::glmer(symptom ~ 1 + month + age + sex + (1 | id), data = madras, family = binomial)
# Attach fitted probabilities from both models for side-by-side comparison
madras <- mutate(
  madras,
  prob_gee  = predict(gee_age_sex, type = "response"),
  prob_glmm = predict(glmm_age_sex, type = "response")
)
# Average the GEE probabilities within each month/age/sex cell
madras_gee <- madras %>%
  group_by(month, age, sex) %>%
  summarise(prob = mean(prob_gee))
# Average the GLMM probabilities within the same cells
madras_glmm <- madras %>%
  group_by(month, age, sex) %>%
  summarise(prob = mean(prob_glmm))
# Stack the two aggregated fits, tagging each row with its model type.
# Fixed: use `<-` (not `=`) for top-level assignment, per R style convention.
madras_agg <- bind_rows(
  mutate(madras_glmm, model = "GLMM"),
  mutate(madras_gee, model = "GEE")
)
# Overlay GEE vs GLMM trajectories (linetype = model, color = sex),
# faceted by age group
ggplot(madras_agg, aes(x = month, y = prob)) +
  geom_line(aes(color = sex, linetype = model)) +
  facet_wrap(~ age) +
  labs(x = "Month", y = "Probability") +
  theme_bw()
Chapter 1 - Introduction to data.table
Introduction:
Filtering Rows in a data.table:
Helpers for filtering:
Example code includes:
# Attach the data.table package
library(data.table)
# Build a small three-row data.table
X <- data.table(id = letters[1:3], value = c(0.5, 1.0, 1.5))
# Print X
X
## id value
## 1: a 0.5
## 2: b 1.0
## 3: c 1.5
# Load the 2014 Bay Area bike-share trips dataset and convert it to a
# data.table so the DT[i, j, by] syntax is available
data(batrips, package="bikeshare14")
batrips <- as.data.table(batrips)
# Get number of columns in batrips
col_number <- ncol(batrips)
# Print the first 8 rows
head(batrips, 8)
## trip_id duration start_date start_station start_terminal
## 1: 139545 435 2014-01-01 00:14:00 San Francisco City Hall 58
## 2: 139546 432 2014-01-01 00:14:00 San Francisco City Hall 58
## 3: 139547 1523 2014-01-01 00:17:00 Embarcadero at Sansome 60
## 4: 139549 1620 2014-01-01 00:23:00 Steuart at Market 74
## 5: 139550 1617 2014-01-01 00:23:00 Steuart at Market 74
## 6: 139551 779 2014-01-01 00:24:00 Steuart at Market 74
## 7: 139552 784 2014-01-01 00:24:00 Steuart at Market 74
## 8: 139553 721 2014-01-01 00:25:00 Steuart at Market 74
## end_date end_station end_terminal bike_id
## 1: 2014-01-01 00:21:00 Townsend at 7th 65 473
## 2: 2014-01-01 00:21:00 Townsend at 7th 65 395
## 3: 2014-01-01 00:42:00 Beale at Market 56 331
## 4: 2014-01-01 00:50:00 Powell Street BART 39 605
## 5: 2014-01-01 00:50:00 Powell Street BART 39 453
## 6: 2014-01-01 00:37:00 Washington at Kearney 46 335
## 7: 2014-01-01 00:37:00 Washington at Kearney 46 580
## 8: 2014-01-01 00:37:00 Washington at Kearney 46 563
## subscription_type zip_code
## 1: Subscriber 94612
## 2: Subscriber 94107
## 3: Subscriber 94112
## 4: Customer 92007
## 5: Customer 92007
## 6: Customer 94109
## 7: Customer
## 8: Customer 94109
# Print the last 8 rows (trips from the end of December 2014)
tail(batrips, 8)
## trip_id duration start_date start_station
## 1: 588907 770 2014-12-31 22:51:00 Townsend at 7th
## 2: 588909 992 2014-12-31 23:06:00 Washington at Kearny
## 3: 588908 1004 2014-12-31 23:06:00 Washington at Kearny
## 4: 588910 437 2014-12-31 23:18:00 Powell Street BART
## 5: 588911 422 2014-12-31 23:19:00 Grant Avenue at Columbus Avenue
## 6: 588912 1487 2014-12-31 23:31:00 South Van Ness at Market
## 7: 588913 1458 2014-12-31 23:32:00 South Van Ness at Market
## 8: 588914 364 2014-12-31 23:33:00 Embarcadero at Bryant
## start_terminal end_date
## 1: 65 2014-12-31 23:04:00
## 2: 46 2014-12-31 23:23:00
## 3: 46 2014-12-31 23:23:00
## 4: 39 2014-12-31 23:25:00
## 5: 73 2014-12-31 23:26:00
## 6: 66 2014-12-31 23:56:00
## 7: 66 2014-12-31 23:56:00
## 8: 54 2014-12-31 23:40:00
## end_station end_terminal bike_id
## 1: Howard at 2nd 63 677
## 2: Embarcadero at Vallejo 48 485
## 3: Embarcadero at Vallejo 48 419
## 4: San Francisco Caltrain (Townsend at 4th) 70 573
## 5: Yerba Buena Center of the Arts (3rd @ Howard) 68 604
## 6: Steuart at Market 74 480
## 7: Steuart at Market 74 277
## 8: Howard at 2nd 63 56
## subscription_type zip_code
## 1: Subscriber 94107
## 2: Customer 92104
## 3: Customer 92104
## 4: Subscriber 95050
## 5: Subscriber 94133
## 6: Customer 94109
## 7: Customer 94109
## 8: Subscriber 94105
# Print the structure of batrips: column names, types, and dimensions
str(batrips)
## Classes 'data.table' and 'data.frame': 326339 obs. of 11 variables:
## $ trip_id : int 139545 139546 139547 139549 139550 139551 139552 139553 139554 139555 ...
## $ duration : int 435 432 1523 1620 1617 779 784 721 624 574 ...
## $ start_date : POSIXct, format: "2014-01-01 00:14:00" "2014-01-01 00:14:00" ...
## $ start_station : chr "San Francisco City Hall" "San Francisco City Hall" "Embarcadero at Sansome" "Steuart at Market" ...
## $ start_terminal : int 58 58 60 74 74 74 74 74 57 57 ...
## $ end_date : POSIXct, format: "2014-01-01 00:21:00" "2014-01-01 00:21:00" ...
## $ end_station : chr "Townsend at 7th" "Townsend at 7th" "Beale at Market" "Powell Street BART" ...
## $ end_terminal : int 65 65 56 39 39 46 46 46 68 68 ...
## $ bike_id : int 473 395 331 605 453 335 580 563 358 365 ...
## $ subscription_type: chr "Subscriber" "Subscriber" "Subscriber" "Customer" ...
## $ zip_code : chr "94612" "94107" "94112" "92007" ...
## - attr(*, ".internal.selfref")=<externalptr>
# Filter third row. Unlike a data.frame, a data.table interprets DT[3]
# as row selection (no trailing comma needed).
row_3 <- batrips[3]
row_3
## trip_id duration start_date start_station start_terminal
## 1: 139547 1523 2014-01-01 00:17:00 Embarcadero at Sansome 60
## end_date end_station end_terminal bike_id subscription_type
## 1: 2014-01-01 00:42:00 Beale at Market 56 331 Subscriber
## zip_code
## 1: 94112
# Filter rows 10 through 20 by passing an integer range in `i`
rows_10_20 <- batrips[10:20]
rows_10_20
## trip_id duration start_date start_station
## 1: 139555 574 2014-01-01 00:25:00 5th at Howard
## 2: 139558 1600 2014-01-01 00:28:00 Harry Bridges Plaza (Ferry Building)
## 3: 139559 3691 2014-01-01 00:32:00 Steuart at Market
## 4: 139560 3793 2014-01-01 00:32:00 Steuart at Market
## 5: 139561 3788 2014-01-01 00:32:00 Steuart at Market
## 6: 139562 3626 2014-01-01 00:33:00 Steuart at Market
## 7: 139563 805 2014-01-01 00:33:00 Steuart at Market
## 8: 139564 769 2014-01-01 00:34:00 Steuart at Market
## 9: 139565 1157 2014-01-01 00:36:00 Steuart at Market
## 10: 139566 1159 2014-01-01 00:37:00 Steuart at Market
## 11: 139567 477 2014-01-01 00:39:00 Beale at Market
## start_terminal end_date
## 1: 57 2014-01-01 00:35:00
## 2: 50 2014-01-01 00:54:00
## 3: 74 2014-01-01 01:33:00
## 4: 74 2014-01-01 01:35:00
## 5: 74 2014-01-01 01:35:00
## 6: 74 2014-01-01 01:33:00
## 7: 74 2014-01-01 00:47:00
## 8: 74 2014-01-01 00:47:00
## 9: 74 2014-01-01 00:56:00
## 10: 74 2014-01-01 00:56:00
## 11: 56 2014-01-01 00:47:00
## end_station end_terminal bike_id
## 1: Yerba Buena Center of the Arts (3rd @ Howard) 68 365
## 2: Steuart at Market 74 413
## 3: Steuart at Market 74 619
## 4: Steuart at Market 74 311
## 5: Steuart at Market 74 577
## 6: Steuart at Market 74 271
## 7: Yerba Buena Center of the Arts (3rd @ Howard) 68 508
## 8: Yerba Buena Center of the Arts (3rd @ Howard) 68 384
## 9: Civic Center BART (7th at Market) 72 276
## 10: Civic Center BART (7th at Market) 72 423
## 11: Spear at Folsom 49 265
## subscription_type zip_code
## 1: Customer 94941
## 2: Subscriber 94102
## 3: Customer 94070
## 4: Customer 55417
## 5: Customer 55417
## 6: Customer 94070
## 7: Customer
## 8: Customer
## 9: Customer
## 10: Customer
## 11: Customer 94105
# Filter the 1st, 6th and 10th rows with an arbitrary integer vector
rows_1_6_10 <- batrips[c(1, 6, 10)]
rows_1_6_10
## trip_id duration start_date start_station start_terminal
## 1: 139545 435 2014-01-01 00:14:00 San Francisco City Hall 58
## 2: 139551 779 2014-01-01 00:24:00 Steuart at Market 74
## 3: 139555 574 2014-01-01 00:25:00 5th at Howard 57
## end_date end_station
## 1: 2014-01-01 00:21:00 Townsend at 7th
## 2: 2014-01-01 00:37:00 Washington at Kearney
## 3: 2014-01-01 00:35:00 Yerba Buena Center of the Arts (3rd @ Howard)
## end_terminal bike_id subscription_type zip_code
## 1: 65 473 Subscriber 94612
## 2: 46 335 Customer 94109
## 3: 68 365 Customer 94941
# Select all rows except the first two; negative indices drop rows
not_first_two <- batrips[-c(1:2)]
not_first_two
## trip_id duration start_date start_station
## 1: 139547 1523 2014-01-01 00:17:00 Embarcadero at Sansome
## 2: 139549 1620 2014-01-01 00:23:00 Steuart at Market
## 3: 139550 1617 2014-01-01 00:23:00 Steuart at Market
## 4: 139551 779 2014-01-01 00:24:00 Steuart at Market
## 5: 139552 784 2014-01-01 00:24:00 Steuart at Market
## ---
## 326333: 588910 437 2014-12-31 23:18:00 Powell Street BART
## 326334: 588911 422 2014-12-31 23:19:00 Grant Avenue at Columbus Avenue
## 326335: 588912 1487 2014-12-31 23:31:00 South Van Ness at Market
## 326336: 588913 1458 2014-12-31 23:32:00 South Van Ness at Market
## 326337: 588914 364 2014-12-31 23:33:00 Embarcadero at Bryant
## start_terminal end_date
## 1: 60 2014-01-01 00:42:00
## 2: 74 2014-01-01 00:50:00
## 3: 74 2014-01-01 00:50:00
## 4: 74 2014-01-01 00:37:00
## 5: 74 2014-01-01 00:37:00
## ---
## 326333: 39 2014-12-31 23:25:00
## 326334: 73 2014-12-31 23:26:00
## 326335: 66 2014-12-31 23:56:00
## 326336: 66 2014-12-31 23:56:00
## 326337: 54 2014-12-31 23:40:00
## end_station end_terminal bike_id
## 1: Beale at Market 56 331
## 2: Powell Street BART 39 605
## 3: Powell Street BART 39 453
## 4: Washington at Kearney 46 335
## 5: Washington at Kearney 46 580
## ---
## 326333: San Francisco Caltrain (Townsend at 4th) 70 573
## 326334: Yerba Buena Center of the Arts (3rd @ Howard) 68 604
## 326335: Steuart at Market 74 480
## 326336: Steuart at Market 74 277
## 326337: Howard at 2nd 63 56
## subscription_type zip_code
## 1: Subscriber 94112
## 2: Customer 92007
## 3: Customer 92007
## 4: Customer 94109
## 5: Customer
## ---
## 326333: Subscriber 95050
## 326334: Subscriber 94133
## 326335: Customer 94109
## 326336: Customer 94109
## 326337: Subscriber 94105
# Select all rows except 1 through 5 and 10 through 15 by combining
# ranges in one negative index vector
exclude_some <- batrips[-c(1:5, 10:15)]
exclude_some
## trip_id duration start_date start_station
## 1: 139551 779 2014-01-01 00:24:00 Steuart at Market
## 2: 139552 784 2014-01-01 00:24:00 Steuart at Market
## 3: 139553 721 2014-01-01 00:25:00 Steuart at Market
## 4: 139554 624 2014-01-01 00:25:00 5th at Howard
## 5: 139563 805 2014-01-01 00:33:00 Steuart at Market
## ---
## 326324: 588910 437 2014-12-31 23:18:00 Powell Street BART
## 326325: 588911 422 2014-12-31 23:19:00 Grant Avenue at Columbus Avenue
## 326326: 588912 1487 2014-12-31 23:31:00 South Van Ness at Market
## 326327: 588913 1458 2014-12-31 23:32:00 South Van Ness at Market
## 326328: 588914 364 2014-12-31 23:33:00 Embarcadero at Bryant
## start_terminal end_date
## 1: 74 2014-01-01 00:37:00
## 2: 74 2014-01-01 00:37:00
## 3: 74 2014-01-01 00:37:00
## 4: 57 2014-01-01 00:35:00
## 5: 74 2014-01-01 00:47:00
## ---
## 326324: 39 2014-12-31 23:25:00
## 326325: 73 2014-12-31 23:26:00
## 326326: 66 2014-12-31 23:56:00
## 326327: 66 2014-12-31 23:56:00
## 326328: 54 2014-12-31 23:40:00
## end_station end_terminal bike_id
## 1: Washington at Kearney 46 335
## 2: Washington at Kearney 46 580
## 3: Washington at Kearney 46 563
## 4: Yerba Buena Center of the Arts (3rd @ Howard) 68 358
## 5: Yerba Buena Center of the Arts (3rd @ Howard) 68 508
## ---
## 326324: San Francisco Caltrain (Townsend at 4th) 70 573
## 326325: Yerba Buena Center of the Arts (3rd @ Howard) 68 604
## 326326: Steuart at Market 74 480
## 326327: Steuart at Market 74 277
## 326328: Howard at 2nd 63 56
## subscription_type zip_code
## 1: Customer 94109
## 2: Customer
## 3: Customer 94109
## 4: Customer 94941
## 5: Customer
## ---
## 326324: Subscriber 95050
## 326325: Subscriber 94133
## 326326: Customer 94109
## 326327: Customer 94109
## 326328: Subscriber 94105
# Select all rows except the first and last. Inside `[ ]`, the special
# symbol .N holds the number of rows of the data.table.
not_first_last <- batrips[-c(1, .N)]
not_first_last
## trip_id duration start_date start_station
## 1: 139546 432 2014-01-01 00:14:00 San Francisco City Hall
## 2: 139547 1523 2014-01-01 00:17:00 Embarcadero at Sansome
## 3: 139549 1620 2014-01-01 00:23:00 Steuart at Market
## 4: 139550 1617 2014-01-01 00:23:00 Steuart at Market
## 5: 139551 779 2014-01-01 00:24:00 Steuart at Market
## ---
## 326333: 588908 1004 2014-12-31 23:06:00 Washington at Kearny
## 326334: 588910 437 2014-12-31 23:18:00 Powell Street BART
## 326335: 588911 422 2014-12-31 23:19:00 Grant Avenue at Columbus Avenue
## 326336: 588912 1487 2014-12-31 23:31:00 South Van Ness at Market
## 326337: 588913 1458 2014-12-31 23:32:00 South Van Ness at Market
## start_terminal end_date
## 1: 58 2014-01-01 00:21:00
## 2: 60 2014-01-01 00:42:00
## 3: 74 2014-01-01 00:50:00
## 4: 74 2014-01-01 00:50:00
## 5: 74 2014-01-01 00:37:00
## ---
## 326333: 46 2014-12-31 23:23:00
## 326334: 39 2014-12-31 23:25:00
## 326335: 73 2014-12-31 23:26:00
## 326336: 66 2014-12-31 23:56:00
## 326337: 66 2014-12-31 23:56:00
## end_station end_terminal bike_id
## 1: Townsend at 7th 65 395
## 2: Beale at Market 56 331
## 3: Powell Street BART 39 605
## 4: Powell Street BART 39 453
## 5: Washington at Kearney 46 335
## ---
## 326333: Embarcadero at Vallejo 48 419
## 326334: San Francisco Caltrain (Townsend at 4th) 70 573
## 326335: Yerba Buena Center of the Arts (3rd @ Howard) 68 604
## 326336: Steuart at Market 74 480
## 326337: Steuart at Market 74 277
## subscription_type zip_code
## 1: Subscriber 94107
## 2: Subscriber 94112
## 3: Customer 92007
## 4: Customer 92007
## 5: Customer 94109
## ---
## 326333: Customer 92104
## 326334: Subscriber 95050
## 326335: Subscriber 94133
## 326336: Customer 94109
## 326337: Customer 94109
# Filter all rows where start_station is "MLK Library". Column names are
# used directly in `i` (no quotes, no batrips$ prefix).
trips_mlk <- batrips[start_station == "MLK Library"]
trips_mlk
## trip_id duration start_date start_station start_terminal
## 1: 140683 426 2014-01-02 16:07:00 MLK Library 11
## 2: 140787 333 2014-01-02 16:51:00 MLK Library 11
## 3: 141476 467 2014-01-03 10:08:00 MLK Library 11
## 4: 141641 246 2014-01-03 12:32:00 MLK Library 11
## 5: 141830 380 2014-01-03 15:56:00 MLK Library 11
## ---
## 831: 588367 34719 2014-12-31 01:08:00 MLK Library 11
## 832: 588366 34715 2014-12-31 01:08:00 MLK Library 11
## 833: 588368 34595 2014-12-31 01:10:00 MLK Library 11
## 834: 588429 692 2014-12-31 07:29:00 MLK Library 11
## 835: 588430 599 2014-12-31 07:30:00 MLK Library 11
## end_date end_station end_terminal bike_id
## 1: 2014-01-02 16:14:00 San Jose Diridon Caltrain Station 2 657
## 2: 2014-01-02 16:56:00 Adobe on Almaden 5 140
## 3: 2014-01-03 10:15:00 Paseo de San Antonio 7 182
## 4: 2014-01-03 12:37:00 San Salvador at 1st 8 665
## 5: 2014-01-03 16:03:00 San Jose Diridon Caltrain Station 2 176
## ---
## 831: 2014-12-31 10:47:00 MLK Library 11 128
## 832: 2014-12-31 10:47:00 MLK Library 11 667
## 833: 2014-12-31 10:47:00 MLK Library 11 241
## 834: 2014-12-31 07:40:00 San Jose Diridon Caltrain Station 2 180
## 835: 2014-12-31 07:40:00 San Jose Diridon Caltrain Station 2 682
## subscription_type zip_code
## 1: Subscriber 94043
## 2: Subscriber 94536
## 3: Subscriber 95035
## 4: Subscriber 95112
## 5: Subscriber 94043
## ---
## 831: Customer 95112
## 832: Customer 95112
## 833: Customer 95076
## 834: Subscriber 95112
## 835: Subscriber 95113
# Filter all rows where start_station is "MLK Library" AND duration > 1600.
# Conditions combine elementwise with `&` inside `i`.
trips_mlk_1600 <- batrips[start_station == "MLK Library" & duration > 1600]
trips_mlk_1600
## trip_id duration start_date start_station start_terminal
## 1: 147733 1744 2014-01-09 11:47:00 MLK Library 11
## 2: 158900 61848 2014-01-19 16:42:00 MLK Library 11
## 3: 159736 1665 2014-01-20 18:38:00 MLK Library 11
## 4: 159737 1671 2014-01-20 18:38:00 MLK Library 11
## 5: 166370 6125 2014-01-26 14:12:00 MLK Library 11
## 6: 166371 6039 2014-01-26 14:13:00 MLK Library 11
## 7: 171845 4629 2014-01-31 01:07:00 MLK Library 11
## 8: 176588 1781 2014-02-05 09:03:00 MLK Library 11
## 9: 180917 68778 2014-02-11 15:16:00 MLK Library 11
## 10: 207319 1615 2014-03-09 17:04:00 MLK Library 11
## 11: 207641 4490 2014-03-10 08:06:00 MLK Library 11
## 12: 215277 1652 2014-03-15 15:36:00 MLK Library 11
## 13: 215829 26719 2014-03-16 14:09:00 MLK Library 11
## 14: 215835 26606 2014-03-16 14:11:00 MLK Library 11
## 15: 224012 3553 2014-03-22 19:16:00 MLK Library 11
## 16: 224013 3204 2014-03-22 19:22:00 MLK Library 11
## 17: 224304 9506 2014-03-23 14:37:00 MLK Library 11
## 18: 224305 9362 2014-03-23 14:40:00 MLK Library 11
## 19: 231184 2336 2014-03-30 08:48:00 MLK Library 11
## 20: 231461 1839 2014-03-30 15:34:00 MLK Library 11
## 21: 233561 5145 2014-04-02 08:01:00 MLK Library 11
## 22: 237605 6406 2014-04-05 14:23:00 MLK Library 11
## 23: 238354 7041 2014-04-06 18:26:00 MLK Library 11
## 24: 238964 2474 2014-04-07 10:50:00 MLK Library 11
## 25: 243981 22350 2014-04-10 16:34:00 MLK Library 11
## 26: 244441 38525 2014-04-11 00:45:00 MLK Library 11
## 27: 245171 1727 2014-04-11 15:07:00 MLK Library 11
## 28: 245173 1682 2014-04-11 15:08:00 MLK Library 11
## 29: 245208 2245 2014-04-11 15:39:00 MLK Library 11
## 30: 245212 2149 2014-04-11 15:40:00 MLK Library 11
## 31: 245977 25692 2014-04-12 13:47:00 MLK Library 11
## 32: 258870 6557 2014-04-23 16:43:00 MLK Library 11
## 33: 262804 3582 2014-04-28 08:17:00 MLK Library 11
## 34: 271145 1631 2014-05-04 15:19:00 MLK Library 11
## 35: 275500 168395 2014-05-07 15:47:00 MLK Library 11
## 36: 275926 5390 2014-05-07 18:45:00 MLK Library 11
## 37: 275928 5332 2014-05-07 18:46:00 MLK Library 11
## 38: 276189 1705 2014-05-08 06:57:00 MLK Library 11
## 39: 278974 11620 2014-05-10 09:47:00 MLK Library 11
## 40: 286203 2105 2014-05-15 19:43:00 MLK Library 11
## 41: 287783 18184 2014-05-16 20:07:00 MLK Library 11
## 42: 302706 8035 2014-05-29 12:23:00 MLK Library 11
## 43: 302712 7860 2014-05-29 12:25:00 MLK Library 11
## 44: 305126 4874 2014-05-31 11:20:00 MLK Library 11
## 45: 305130 9338 2014-05-31 11:22:00 MLK Library 11
## 46: 315821 15235 2014-06-08 21:22:00 MLK Library 11
## 47: 316695 4403 2014-06-09 13:23:00 MLK Library 11
## 48: 316694 4398 2014-06-09 13:23:00 MLK Library 11
## 49: 324444 1723 2014-06-14 16:21:00 MLK Library 11
## 50: 324446 1700 2014-06-14 16:22:00 MLK Library 11
## 51: 325172 2964 2014-06-15 19:53:00 MLK Library 11
## 52: 325174 2796 2014-06-15 19:56:00 MLK Library 11
## 53: 326021 2477 2014-06-16 12:35:00 MLK Library 11
## 54: 329525 6966 2014-06-18 13:34:00 MLK Library 11
## 55: 331197 20671 2014-06-19 14:30:00 MLK Library 11
## 56: 331253 16797 2014-06-19 15:35:00 MLK Library 11
## 57: 331932 1913 2014-06-19 21:16:00 MLK Library 11
## 58: 336447 2626 2014-06-24 07:33:00 MLK Library 11
## 59: 339011 19511 2014-06-25 14:06:00 MLK Library 11
## 60: 339013 15690 2014-06-25 14:07:00 MLK Library 11
## 61: 345918 2579 2014-06-30 20:46:00 MLK Library 11
## 62: 346850 7419 2014-07-01 15:34:00 MLK Library 11
## 63: 351202 75126 2014-07-04 23:06:00 MLK Library 11
## 64: 370599 14793 2014-07-18 20:16:00 MLK Library 11
## 65: 391024 7450 2014-08-01 19:21:00 MLK Library 11
## 66: 391894 9001 2014-08-03 12:04:00 MLK Library 11
## 67: 407384 1920 2014-08-13 16:18:00 MLK Library 11
## 68: 407382 1928 2014-08-13 16:18:00 MLK Library 11
## 69: 410332 8303 2014-08-15 11:20:00 MLK Library 11
## 70: 410333 8210 2014-08-15 11:22:00 MLK Library 11
## 71: 410336 8066 2014-08-15 11:24:00 MLK Library 11
## 72: 411884 2431 2014-08-16 23:26:00 MLK Library 11
## 73: 412323 10611 2014-08-17 18:14:00 MLK Library 11
## 74: 427235 79197 2014-08-27 13:31:00 MLK Library 11
## 75: 428990 12149 2014-08-28 11:43:00 MLK Library 11
## 76: 428989 12148 2014-08-28 11:43:00 MLK Library 11
## 77: 461834 19167 2014-09-19 20:13:00 MLK Library 11
## 78: 461993 12590 2014-09-20 10:13:00 MLK Library 11
## 79: 468000 1711 2014-09-24 16:34:00 MLK Library 11
## 80: 469674 2859 2014-09-25 16:42:00 MLK Library 11
## 81: 476380 3250 2014-09-30 16:34:00 MLK Library 11
## 82: 495778 3015 2014-10-13 11:16:00 MLK Library 11
## 83: 505517 9848 2014-10-19 14:56:00 MLK Library 11
## 84: 506277 1812 2014-10-20 09:16:00 MLK Library 11
## 85: 509993 1985 2014-10-22 09:37:00 MLK Library 11
## 86: 514887 3298 2014-10-24 21:06:00 MLK Library 11
## 87: 534363 2229 2014-11-07 09:55:00 MLK Library 11
## 88: 569828 1691 2014-12-08 13:47:00 MLK Library 11
## 89: 588367 34719 2014-12-31 01:08:00 MLK Library 11
## 90: 588366 34715 2014-12-31 01:08:00 MLK Library 11
## 91: 588368 34595 2014-12-31 01:10:00 MLK Library 11
## trip_id duration start_date start_station start_terminal
## end_date end_station end_terminal bike_id
## 1: 2014-01-09 12:16:00 San Jose City Hall 10 691
## 2: 2014-01-20 09:52:00 San Jose Civic Center 3 86
## 3: 2014-01-20 19:06:00 MLK Library 11 175
## 4: 2014-01-20 19:06:00 MLK Library 11 262
## 5: 2014-01-26 15:54:00 MLK Library 11 11
## 6: 2014-01-26 15:54:00 MLK Library 11 308
## 7: 2014-01-31 02:24:00 SJSU - San Salvador at 9th 16 307
## 8: 2014-02-05 09:33:00 Paseo de San Antonio 7 249
## 9: 2014-02-12 10:23:00 MLK Library 11 140
## 10: 2014-03-09 17:31:00 MLK Library 11 133
## 11: 2014-03-10 09:21:00 Paseo de San Antonio 7 133
## 12: 2014-03-15 16:04:00 San Jose Civic Center 3 59
## 13: 2014-03-16 21:35:00 SJSU - San Salvador at 9th 16 690
## 14: 2014-03-16 21:35:00 SJSU - San Salvador at 9th 16 666
## 15: 2014-03-22 20:15:00 Adobe on Almaden 5 132
## 16: 2014-03-22 20:15:00 Adobe on Almaden 5 176
## 17: 2014-03-23 17:16:00 Adobe on Almaden 5 140
## 18: 2014-03-23 17:16:00 Adobe on Almaden 5 668
## 19: 2014-03-30 09:27:00 St James Park 13 175
## 20: 2014-03-30 16:04:00 MLK Library 11 199
## 21: 2014-04-02 09:27:00 Paseo de San Antonio 7 154
## 22: 2014-04-05 16:10:00 SJSU 4th at San Carlos 12 62
## 23: 2014-04-06 20:23:00 MLK Library 11 158
## 24: 2014-04-07 11:31:00 San Jose Diridon Caltrain Station 2 199
## 25: 2014-04-10 22:46:00 MLK Library 11 168
## 26: 2014-04-11 11:27:00 SJSU - San Salvador at 9th 16 10
## 27: 2014-04-11 15:36:00 MLK Library 11 112
## 28: 2014-04-11 15:36:00 MLK Library 11 187
## 29: 2014-04-11 16:16:00 San Jose Civic Center 3 176
## 30: 2014-04-11 16:16:00 San Jose Civic Center 3 112
## 31: 2014-04-12 20:55:00 MLK Library 11 247
## 32: 2014-04-23 18:32:00 Santa Clara County Civic Center 80 716
## 33: 2014-04-28 09:17:00 Paseo de San Antonio 7 130
## 34: 2014-05-04 15:46:00 San Jose Civic Center 3 132
## 35: 2014-05-09 14:34:00 San Jose City Hall 10 39
## 36: 2014-05-07 20:15:00 MLK Library 11 164
## 37: 2014-05-07 20:15:00 MLK Library 11 109
## 38: 2014-05-08 07:26:00 San Jose City Hall 10 257
## 39: 2014-05-10 13:00:00 MLK Library 11 109
## 40: 2014-05-15 20:18:00 San Pedro Square 6 658
## 41: 2014-05-17 01:10:00 MLK Library 11 691
## 42: 2014-05-29 14:37:00 MLK Library 11 227
## 43: 2014-05-29 14:36:00 MLK Library 11 15
## 44: 2014-05-31 12:41:00 SJSU 4th at San Carlos 12 122
## 45: 2014-05-31 13:58:00 SJSU 4th at San Carlos 12 227
## 46: 2014-06-09 01:36:00 Adobe on Almaden 5 711
## 47: 2014-06-09 14:36:00 SJSU - San Salvador at 9th 16 30
## 48: 2014-06-09 14:36:00 SJSU - San Salvador at 9th 16 227
## 49: 2014-06-14 16:50:00 San Jose Diridon Caltrain Station 2 150
## 50: 2014-06-14 16:50:00 San Jose Diridon Caltrain Station 2 164
## 51: 2014-06-15 20:42:00 MLK Library 11 715
## 52: 2014-06-15 20:43:00 MLK Library 11 646
## 53: 2014-06-16 13:16:00 San Jose City Hall 10 709
## 54: 2014-06-18 15:30:00 MLK Library 11 45
## 55: 2014-06-19 20:15:00 SJSU - San Salvador at 9th 16 258
## 56: 2014-06-19 20:15:00 SJSU - San Salvador at 9th 16 646
## 57: 2014-06-19 21:48:00 SJSU - San Salvador at 9th 16 646
## 58: 2014-06-24 08:16:00 MLK Library 11 96
## 59: 2014-06-25 19:31:00 San Jose Civic Center 3 62
## 60: 2014-06-25 18:28:00 San Jose Civic Center 3 711
## 61: 2014-06-30 21:29:00 San Salvador at 1st 8 128
## 62: 2014-07-01 17:38:00 San Jose Diridon Caltrain Station 2 188
## 63: 2014-07-05 19:58:00 MLK Library 11 81
## 64: 2014-07-19 00:22:00 SJSU 4th at San Carlos 12 699
## 65: 2014-08-01 21:25:00 San Jose Diridon Caltrain Station 2 81
## 66: 2014-08-03 14:34:00 MLK Library 11 41
## 67: 2014-08-13 16:50:00 San Jose City Hall 10 255
## 68: 2014-08-13 16:50:00 San Jose City Hall 10 648
## 69: 2014-08-15 13:39:00 MLK Library 11 186
## 70: 2014-08-15 13:39:00 MLK Library 11 223
## 71: 2014-08-15 13:39:00 MLK Library 11 256
## 72: 2014-08-17 00:07:00 MLK Library 11 23
## 73: 2014-08-17 21:11:00 Paseo de San Antonio 7 678
## 74: 2014-08-28 11:31:00 MLK Library 11 128
## 75: 2014-08-28 15:06:00 MLK Library 11 11
## 76: 2014-08-28 15:06:00 MLK Library 11 93
## 77: 2014-09-20 01:33:00 MLK Library 11 294
## 78: 2014-09-20 13:43:00 Paseo de San Antonio 7 298
## 79: 2014-09-24 17:02:00 MLK Library 11 305
## 80: 2014-09-25 17:29:00 San Jose City Hall 10 197
## 81: 2014-09-30 17:28:00 San Jose City Hall 10 161
## 82: 2014-10-13 12:07:00 Japantown 9 205
## 83: 2014-10-19 17:40:00 MLK Library 11 30
## 84: 2014-10-20 09:46:00 Japantown 9 107
## 85: 2014-10-22 10:10:00 Japantown 9 301
## 86: 2014-10-24 22:01:00 SJSU - San Salvador at 9th 16 706
## 87: 2014-11-07 10:32:00 Japantown 9 51
## 88: 2014-12-08 14:15:00 Japantown 9 253
## 89: 2014-12-31 10:47:00 MLK Library 11 128
## 90: 2014-12-31 10:47:00 MLK Library 11 667
## 91: 2014-12-31 10:47:00 MLK Library 11 241
## end_date end_station end_terminal bike_id
## subscription_type zip_code
## 1: Subscriber 95112
## 2: Customer 95608
## 3: Customer 95166
## 4: Customer 95166
## 5: Customer 95117
## 6: Customer 95117
## 7: Customer 94510
## 8: Subscriber 95126
## 9: Subscriber 95112
## 10: Customer 95112
## 11: Subscriber 95126
## 12: Customer 92646
## 13: Customer 90278
## 14: Customer 95112
## 15: Customer 95122
## 16: Customer 95122
## 17: Customer <NA>
## 18: Customer 95112
## 19: Customer 60613
## 20: Customer <NA>
## 21: Subscriber 95126
## 22: Subscriber 95126
## 23: Customer 95112
## 24: Subscriber 95112
## 25: Customer 9539
## 26: Customer 95131
## 27: Customer <NA>
## 28: Customer <NA>
## 29: Customer <NA>
## 30: Customer <NA>
## 31: Customer 95112
## 32: Customer 95054
## 33: Subscriber 95126
## 34: Customer 95123
## 35: Customer 6473
## 36: Customer <NA>
## 37: Customer <NA>
## 38: Subscriber 94041
## 39: Customer 95125
## 40: Customer 94089
## 41: Customer 195127
## 42: Customer 95112
## 43: Customer 95112
## 44: Customer 95476
## 45: Customer 95476
## 46: Customer 80526
## 47: Customer 95116
## 48: Customer 95116
## 49: Customer 95112
## 50: Customer 95112
## 51: Customer 95123
## 52: Customer 95123
## 53: Subscriber 94010
## 54: Customer 3102
## 55: Customer 94704
## 56: Customer 94708
## 57: Customer 94708
## 58: Customer 28801
## 59: Customer <NA>
## 60: Customer 95113
## 61: Customer 95133
## 62: Subscriber 94002
## 63: Customer <NA>
## 64: Customer 95037
## 65: Customer 94606
## 66: Customer 95112
## 67: Customer 95110
## 68: Customer 95110
## 69: Customer 95139
## 70: Customer 95139
## 71: Customer 95139
## 72: Subscriber 95112
## 73: Customer <NA>
## 74: Subscriber 95112
## 75: Customer 90621
## 76: Customer 90621
## 77: Customer 95116
## 78: Customer 92105
## 79: Customer 95112
## 80: Customer 95112
## 81: Customer 95112
## 82: Subscriber 95122
## 83: Customer 889503
## 84: Subscriber 95122
## 85: Subscriber 95122
## 86: Subscriber 95122
## 87: Subscriber 95122
## 88: Subscriber 95122
## 89: Customer 95112
## 90: Customer 95112
## 91: Customer 95076
## subscription_type zip_code
# Filter all rows where `subscription_type` is not `"Subscriber"`
# (i.e. keep the casual "Customer" trips)
customers <- batrips[subscription_type != "Subscriber"]
customers
## trip_id duration start_date start_station
## 1: 139549 1620 2014-01-01 00:23:00 Steuart at Market
## 2: 139550 1617 2014-01-01 00:23:00 Steuart at Market
## 3: 139551 779 2014-01-01 00:24:00 Steuart at Market
## 4: 139552 784 2014-01-01 00:24:00 Steuart at Market
## 5: 139553 721 2014-01-01 00:25:00 Steuart at Market
## ---
## 48572: 588900 1428 2014-12-31 22:06:00 Embarcadero at Bryant
## 48573: 588909 992 2014-12-31 23:06:00 Washington at Kearny
## 48574: 588908 1004 2014-12-31 23:06:00 Washington at Kearny
## 48575: 588912 1487 2014-12-31 23:31:00 South Van Ness at Market
## 48576: 588913 1458 2014-12-31 23:32:00 South Van Ness at Market
## start_terminal end_date end_station end_terminal
## 1: 74 2014-01-01 00:50:00 Powell Street BART 39
## 2: 74 2014-01-01 00:50:00 Powell Street BART 39
## 3: 74 2014-01-01 00:37:00 Washington at Kearney 46
## 4: 74 2014-01-01 00:37:00 Washington at Kearney 46
## 5: 74 2014-01-01 00:37:00 Washington at Kearney 46
## ---
## 48572: 54 2014-12-31 22:30:00 Embarcadero at Folsom 51
## 48573: 46 2014-12-31 23:23:00 Embarcadero at Vallejo 48
## 48574: 46 2014-12-31 23:23:00 Embarcadero at Vallejo 48
## 48575: 66 2014-12-31 23:56:00 Steuart at Market 74
## 48576: 66 2014-12-31 23:56:00 Steuart at Market 74
## bike_id subscription_type zip_code
## 1: 605 Customer 92007
## 2: 453 Customer 92007
## 3: 335 Customer 94109
## 4: 580 Customer
## 5: 563 Customer 94109
## ---
## 48572: 502 Customer 94587
## 48573: 485 Customer 92104
## 48574: 419 Customer 92104
## 48575: 480 Customer 94109
## 48576: 277 Customer 94109
# Filter all rows where start_station is "Ryland Park" AND subscription_type is not "Customer"
# (multiple conditions combine with the elementwise & operator inside i)
ryland_park_subscribers <- batrips[start_station=="Ryland Park" & subscription_type != "Customer"]
ryland_park_subscribers
## trip_id duration start_date start_station start_terminal
## 1: 243456 330 2014-04-10 09:10:00 Ryland Park 84
## 2: 244497 594 2014-04-11 07:28:00 Ryland Park 84
## 3: 245067 265 2014-04-11 13:38:00 Ryland Park 84
## 4: 246212 522 2014-04-12 17:25:00 Ryland Park 84
## 5: 246933 493 2014-04-14 07:30:00 Ryland Park 84
## ---
## 936: 587254 493 2014-12-29 13:27:00 Ryland Park 84
## 937: 587827 619 2014-12-30 08:35:00 Ryland Park 84
## 938: 588063 489 2014-12-30 13:29:00 Ryland Park 84
## 939: 588442 423 2014-12-31 07:48:00 Ryland Park 84
## 940: 588447 536 2014-12-31 07:56:00 Ryland Park 84
## end_date end_station end_terminal bike_id
## 1: 2014-04-10 09:16:00 Japantown 9 23
## 2: 2014-04-11 07:38:00 San Jose Diridon Caltrain Station 2 54
## 3: 2014-04-11 13:43:00 San Pedro Square 6 62
## 4: 2014-04-12 17:34:00 San Salvador at 1st 8 144
## 5: 2014-04-14 07:38:00 San Jose Diridon Caltrain Station 2 643
## ---
## 936: 2014-12-29 13:36:00 Santa Clara County Civic Center 80 126
## 937: 2014-12-30 08:45:00 Santa Clara County Civic Center 80 126
## 938: 2014-12-30 13:37:00 Santa Clara County Civic Center 80 126
## 939: 2014-12-31 07:55:00 San Jose Civic Center 3 710
## 940: 2014-12-31 08:05:00 Santa Clara County Civic Center 80 75
## subscription_type zip_code
## 1: Subscriber 95110
## 2: Subscriber 95110
## 3: Subscriber 95110
## 4: Subscriber 95110
## 5: Subscriber 95110
## ---
## 936: Subscriber 95112
## 937: Subscriber 95112
## 938: Subscriber 95112
## 939: Subscriber 5112
## 940: Subscriber 95112
# Filter all rows where end_station contains "Market"
# (%like% is data.table's convenience operator for regex pattern matching;
# an unanchored pattern matches anywhere in the string)
any_markets <- batrips[end_station %like% "Market"]
any_markets
## trip_id duration start_date
## 1: 139547 1523 2014-01-01 00:17:00
## 2: 139558 1600 2014-01-01 00:28:00
## 3: 139559 3691 2014-01-01 00:32:00
## 4: 139560 3793 2014-01-01 00:32:00
## 5: 139561 3788 2014-01-01 00:32:00
## ---
## 69120: 588882 1714 2014-12-31 19:14:00
## 69121: 588888 726 2014-12-31 20:05:00
## 69122: 588901 531 2014-12-31 22:08:00
## 69123: 588912 1487 2014-12-31 23:31:00
## 69124: 588913 1458 2014-12-31 23:32:00
## start_station start_terminal end_date
## 1: Embarcadero at Sansome 60 2014-01-01 00:42:00
## 2: Harry Bridges Plaza (Ferry Building) 50 2014-01-01 00:54:00
## 3: Steuart at Market 74 2014-01-01 01:33:00
## 4: Steuart at Market 74 2014-01-01 01:35:00
## 5: Steuart at Market 74 2014-01-01 01:35:00
## ---
## 69120: Embarcadero at Sansome 60 2014-12-31 19:43:00
## 69121: Powell Street BART 39 2014-12-31 20:17:00
## 69122: Embarcadero at Sansome 60 2014-12-31 22:17:00
## 69123: South Van Ness at Market 66 2014-12-31 23:56:00
## 69124: South Van Ness at Market 66 2014-12-31 23:56:00
## end_station end_terminal bike_id subscription_type
## 1: Beale at Market 56 331 Subscriber
## 2: Steuart at Market 74 413 Subscriber
## 3: Steuart at Market 74 619 Customer
## 4: Steuart at Market 74 311 Customer
## 5: Steuart at Market 74 577 Customer
## ---
## 69120: Market at 4th 76 328 Customer
## 69121: Civic Center BART (7th at Market) 72 475 Subscriber
## 69122: Steuart at Market 74 360 Subscriber
## 69123: Steuart at Market 74 480 Customer
## 69124: Steuart at Market 74 277 Customer
## zip_code
## 1: 94112
## 2: 94102
## 3: 94070
## 4: 55417
## 5: 55417
## ---
## 69120: <NA>
## 69121: 94112
## 69122: 94107
## 69123: 94109
## 69124: 94109
# Filter all rows where end_station ends with "Market"
# (the "$" anchors the regex to the end of the string, so "Market at 4th"
# is excluded while "Steuart at Market" matches)
end_markets <- batrips[end_station %like% "Market$"]
end_markets
## trip_id duration start_date
## 1: 139547 1523 2014-01-01 00:17:00
## 2: 139558 1600 2014-01-01 00:28:00
## 3: 139559 3691 2014-01-01 00:32:00
## 4: 139560 3793 2014-01-01 00:32:00
## 5: 139561 3788 2014-01-01 00:32:00
## ---
## 23352: 588841 308 2014-12-31 17:27:00
## 23353: 588855 229 2014-12-31 17:51:00
## 23354: 588901 531 2014-12-31 22:08:00
## 23355: 588912 1487 2014-12-31 23:31:00
## 23356: 588913 1458 2014-12-31 23:32:00
## start_station start_terminal end_date
## 1: Embarcadero at Sansome 60 2014-01-01 00:42:00
## 2: Harry Bridges Plaza (Ferry Building) 50 2014-01-01 00:54:00
## 3: Steuart at Market 74 2014-01-01 01:33:00
## 4: Steuart at Market 74 2014-01-01 01:35:00
## 5: Steuart at Market 74 2014-01-01 01:35:00
## ---
## 23352: Embarcadero at Sansome 60 2014-12-31 17:32:00
## 23353: Broadway St at Battery St 82 2014-12-31 17:55:00
## 23354: Embarcadero at Sansome 60 2014-12-31 22:17:00
## 23355: South Van Ness at Market 66 2014-12-31 23:56:00
## 23356: South Van Ness at Market 66 2014-12-31 23:56:00
## end_station end_terminal bike_id subscription_type zip_code
## 1: Beale at Market 56 331 Subscriber 94112
## 2: Steuart at Market 74 413 Subscriber 94102
## 3: Steuart at Market 74 619 Customer 94070
## 4: Steuart at Market 74 311 Customer 55417
## 5: Steuart at Market 74 577 Customer 55417
## ---
## 23352: Steuart at Market 74 395 Subscriber 94102
## 23353: Beale at Market 56 86 Subscriber 94131
## 23354: Steuart at Market 74 360 Subscriber 94107
## 23355: Steuart at Market 74 480 Customer 94109
## 23356: Steuart at Market 74 277 Customer 94109
# Filter all rows where trip_id is 588841, 139560, or 139562
# (%in% tests set membership; row order of the result follows the table,
# not the order of the lookup vector)
filter_trip_ids <- batrips[trip_id %in% c(588841, 139560, 139562)]
filter_trip_ids
## trip_id duration start_date start_station start_terminal
## 1: 139560 3793 2014-01-01 00:32:00 Steuart at Market 74
## 2: 139562 3626 2014-01-01 00:33:00 Steuart at Market 74
## 3: 588841 308 2014-12-31 17:27:00 Embarcadero at Sansome 60
## end_date end_station end_terminal bike_id subscription_type
## 1: 2014-01-01 01:35:00 Steuart at Market 74 311 Customer
## 2: 2014-01-01 01:33:00 Steuart at Market 74 271 Customer
## 3: 2014-12-31 17:32:00 Steuart at Market 74 395 Subscriber
## zip_code
## 1: 55417
## 2: 94070
## 3: 94102
# Filter all rows where duration is between [5000, 6000]
# (%between% checks a closed interval: both endpoints are included)
duration_5k_6k <- batrips[duration %between% c(5000, 6000)]
duration_5k_6k
## trip_id duration start_date start_station
## 1: 139607 5987 2014-01-01 07:57:00 Market at Sansome
## 2: 139608 5974 2014-01-01 07:57:00 Market at Sansome
## 3: 139663 5114 2014-01-01 11:29:00 Embarcadero at Bryant
## 4: 139664 5040 2014-01-01 11:30:00 Embarcadero at Bryant
## 5: 139887 5023 2014-01-01 16:08:00 Davis at Jackson
## ---
## 1042: 586297 5425 2014-12-27 14:02:00 South Van Ness at Market
## 1043: 586673 5849 2014-12-28 14:50:00 University and Emerson
## 1044: 586707 5162 2014-12-28 15:50:00 San Antonio Shopping Center
## 1045: 586795 5886 2014-12-28 20:05:00 Powell at Post (Union Square)
## 1046: 588370 5850 2014-12-31 02:46:00 Grant Avenue at Columbus Avenue
## start_terminal end_date end_station
## 1: 77 2014-01-01 09:37:00 Grant Avenue at Columbus Avenue
## 2: 77 2014-01-01 09:37:00 Grant Avenue at Columbus Avenue
## 3: 54 2014-01-01 12:55:00 Embarcadero at Bryant
## 4: 54 2014-01-01 12:54:00 Embarcadero at Bryant
## 5: 42 2014-01-01 17:31:00 Grant Avenue at Columbus Avenue
## ---
## 1042: 66 2014-12-27 15:32:00 South Van Ness at Market
## 1043: 35 2014-12-28 16:27:00 University and Emerson
## 1044: 31 2014-12-28 17:16:00 San Antonio Caltrain Station
## 1045: 71 2014-12-28 21:43:00 Powell at Post (Union Square)
## 1046: 73 2014-12-31 04:23:00 Powell at Post (Union Square)
## end_terminal bike_id subscription_type zip_code
## 1: 73 591 Customer 75201
## 2: 73 596 Customer 75201
## 3: 54 604 Customer 94513
## 4: 54 281 Customer 94513
## 5: 73 329 Customer 89117
## ---
## 1042: 66 314 Customer 94306
## 1043: 35 254 Customer 1
## 1044: 29 680 Customer 95616
## 1045: 71 394 Customer 60616
## 1046: 71 401 Customer <NA>
# Filter all rows with specific start stations
# (%chin% is data.table's character-optimized equivalent of %in%)
two_stations <- batrips[start_station %chin% c("San Francisco City Hall", "Embarcadero at Sansome")]
two_stations
## trip_id duration start_date start_station
## 1: 139545 435 2014-01-01 00:14:00 San Francisco City Hall
## 2: 139546 432 2014-01-01 00:14:00 San Francisco City Hall
## 3: 139547 1523 2014-01-01 00:17:00 Embarcadero at Sansome
## 4: 139583 903 2014-01-01 01:34:00 Embarcadero at Sansome
## 5: 139584 873 2014-01-01 01:35:00 Embarcadero at Sansome
## ---
## 15020: 588881 1800 2014-12-31 19:12:00 Embarcadero at Sansome
## 15021: 588882 1714 2014-12-31 19:14:00 Embarcadero at Sansome
## 15022: 588887 274 2014-12-31 20:02:00 Embarcadero at Sansome
## 15023: 588898 979 2014-12-31 22:06:00 San Francisco City Hall
## 15024: 588901 531 2014-12-31 22:08:00 Embarcadero at Sansome
## start_terminal end_date
## 1: 58 2014-01-01 00:21:00
## 2: 58 2014-01-01 00:21:00
## 3: 60 2014-01-01 00:42:00
## 4: 60 2014-01-01 01:49:00
## 5: 60 2014-01-01 01:49:00
## ---
## 15020: 60 2014-12-31 19:42:00
## 15021: 60 2014-12-31 19:43:00
## 15022: 60 2014-12-31 20:07:00
## 15023: 58 2014-12-31 22:22:00
## 15024: 60 2014-12-31 22:17:00
## end_station end_terminal bike_id
## 1: Townsend at 7th 65 473
## 2: Townsend at 7th 65 395
## 3: Beale at Market 56 331
## 4: San Francisco Caltrain (Townsend at 4th) 70 278
## 5: San Francisco Caltrain (Townsend at 4th) 70 336
## ---
## 15020: Market at 4th 76 313
## 15021: Market at 4th 76 328
## 15022: Harry Bridges Plaza (Ferry Building) 50 358
## 15023: Clay at Battery 41 385
## 15024: Steuart at Market 74 360
## subscription_type zip_code
## 1: Subscriber 94612
## 2: Subscriber 94107
## 3: Subscriber 94112
## 4: Customer 9406
## 5: Customer 9406
## ---
## 15020: Customer <NA>
## 15021: Customer <NA>
## 15022: Subscriber 94111
## 15023: Subscriber 94111
## 15024: Subscriber 94107
Chapter 2 - Selecting and Computing on Columns
Selecting columns from a data.table:
Computing on columns the data.table way:
Advanced computations in j:
Example code includes:
# Select bike_id and trip_id using a character vector
# (data.frame-style selection; the result is still a data.table)
df_way <- batrips[, c("bike_id", "trip_id")]
df_way
## bike_id trip_id
## 1: 473 139545
## 2: 395 139546
## 3: 331 139547
## 4: 605 139549
## 5: 453 139550
## ---
## 326335: 573 588910
## 326336: 604 588911
## 326337: 480 588912
## 326338: 277 588913
## 326339: 56 588914
# Select start_station and end_station cols without a character vector
# (.() is an alias for list(); bare column names are looked up in the table)
dt_way <- batrips[, .(start_station, end_station)]
dt_way
## start_station
## 1: San Francisco City Hall
## 2: San Francisco City Hall
## 3: Embarcadero at Sansome
## 4: Steuart at Market
## 5: Steuart at Market
## ---
## 326335: Powell Street BART
## 326336: Grant Avenue at Columbus Avenue
## 326337: South Van Ness at Market
## 326338: South Van Ness at Market
## 326339: Embarcadero at Bryant
## end_station
## 1: Townsend at 7th
## 2: Townsend at 7th
## 3: Beale at Market
## 4: Powell Street BART
## 5: Powell Street BART
## ---
## 326335: San Francisco Caltrain (Townsend at 4th)
## 326336: Yerba Buena Center of the Arts (3rd @ Howard)
## 326337: Steuart at Market
## 326338: Steuart at Market
## 326339: Howard at 2nd
# You can also drop or deselect columns by prepending the character vector of column names with the - or ! operators
# For e.g., dt[, -c("col1", "col2")] or dt[, !c("col1", "col2")] would both return all columns except col1 and col2
# Deselect start_terminal and end_terminal columns
drop_terminal_cols <- batrips[, -c("start_terminal", "end_terminal")]
drop_terminal_cols
## trip_id duration start_date start_station
## 1: 139545 435 2014-01-01 00:14:00 San Francisco City Hall
## 2: 139546 432 2014-01-01 00:14:00 San Francisco City Hall
## 3: 139547 1523 2014-01-01 00:17:00 Embarcadero at Sansome
## 4: 139549 1620 2014-01-01 00:23:00 Steuart at Market
## 5: 139550 1617 2014-01-01 00:23:00 Steuart at Market
## ---
## 326335: 588910 437 2014-12-31 23:18:00 Powell Street BART
## 326336: 588911 422 2014-12-31 23:19:00 Grant Avenue at Columbus Avenue
## 326337: 588912 1487 2014-12-31 23:31:00 South Van Ness at Market
## 326338: 588913 1458 2014-12-31 23:32:00 South Van Ness at Market
## 326339: 588914 364 2014-12-31 23:33:00 Embarcadero at Bryant
## end_date end_station
## 1: 2014-01-01 00:21:00 Townsend at 7th
## 2: 2014-01-01 00:21:00 Townsend at 7th
## 3: 2014-01-01 00:42:00 Beale at Market
## 4: 2014-01-01 00:50:00 Powell Street BART
## 5: 2014-01-01 00:50:00 Powell Street BART
## ---
## 326335: 2014-12-31 23:25:00 San Francisco Caltrain (Townsend at 4th)
## 326336: 2014-12-31 23:26:00 Yerba Buena Center of the Arts (3rd @ Howard)
## 326337: 2014-12-31 23:56:00 Steuart at Market
## 326338: 2014-12-31 23:56:00 Steuart at Market
## 326339: 2014-12-31 23:40:00 Howard at 2nd
## bike_id subscription_type zip_code
## 1: 473 Subscriber 94612
## 2: 395 Subscriber 94107
## 3: 331 Subscriber 94112
## 4: 605 Customer 92007
## 5: 453 Customer 92007
## ---
## 326335: 573 Subscriber 95050
## 326336: 604 Subscriber 94133
## 326337: 480 Customer 94109
## 326338: 277 Customer 94109
## 326339: 56 Subscriber 94105
# Calculate the mean duration using the j argument
# NOTE(review): despite the variable name, this computes mean(), not median() —
# the printed result (1132) matches the mean_durn value shown further below.
# The exercise title presumably asked for median(duration); verify against source.
median_duration <- batrips[, mean(duration)]
median_duration
## [1] 1132
# Get median duration after filtering
# (i filters the rows first; j then computes only over the matching subset)
median_duration_filter <- batrips[end_station == "Market at 10th" & subscription_type == "Subscriber", median(duration)]
median_duration_filter
## [1] 651
# Compute duration of all trips as end_date - start_date, in minutes
# (j can be any expression of columns; here it returns a difftime vector)
trip_duration <- batrips[, difftime(end_date, start_date, units="min")]
head(trip_duration)
## Time differences in mins
## [1] 7 7 25 27 27 13
# Have the column mean_durn
# (wrapping j in .() names the result column and returns a one-row data.table
# instead of a bare scalar)
mean_duration <- batrips[, .(mean_durn=mean(duration))]
mean_duration
## mean_durn
## 1: 1132
# Get the min and max duration values
# (unnamed expressions in .() get automatic column names V1, V2, ...)
min_max_duration <- batrips[, .(min(duration), max(duration))]
min_max_duration
## V1 V2
## 1: 60 17270400
# Compute several summary statistics at once: the mean trip duration
# and the timestamp of the last ride (max end_date)
other_stats <- batrips[, .(mean_duration=mean(duration), last_ride=max(end_date))]
other_stats
## mean_duration last_ride
## 1: 1132 2015-06-24 20:18:00
# Min and max duration for short (< 500 s) trips leaving Townsend at 7th —
# i filters rows, then j summarises only the filtered subset
duration_stats <- batrips[start_station == "Townsend at 7th" & duration < 500,
.(min_dur=min(duration), max_dur=max(duration))]
duration_stats
## min_dur max_dur
## 1: 62 499
# Plot the histogram of duration based on conditions
# (j accepts arbitrary expressions, including plotting calls; hist() both
# draws the plot and returns the histogram object printed below)
batrips[start_station == "Townsend at 7th" & duration < 500, hist(duration)]
## $breaks
## [1] 50 100 150 200 250 300 350 400 450 500
##
## $counts
## [1] 28 15 792 2042 920 314 314 497 538
##
## $density
## [1] 1.03e-04 5.49e-05 2.90e-03 7.48e-03 3.37e-03 1.15e-03 1.15e-03 1.82e-03
## [9] 1.97e-03
##
## $mids
## [1] 75 125 175 225 275 325 375 425 475
##
## $xname
## [1] "duration"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
Chapter 3 - Groupwise Operations
Computations by Groups:
Chaining data.table expressions:
Computations in j using .SD:
Example code includes:
# Compute the mean duration for every start_station
# (by = accepts a quoted column name; one output row per group)
mean_start_stn <- batrips[, .(mean_duration=mean(duration)), by = "start_station"]
mean_start_stn
## start_station mean_duration
## 1: San Francisco City Hall 1894
## 2: Embarcadero at Sansome 1418
## 3: Steuart at Market 957
## 4: 5th at Howard 845
## 5: Harry Bridges Plaza (Ferry Building) 1516
## 6: Beale at Market 857
## 7: Embarcadero at Folsom 854
## 8: 2nd at South Park 698
## 9: Santa Clara at Almaden 954
## 10: Powell Street BART 1332
## 11: Howard at 2nd 740
## 12: 2nd at Townsend 841
## 13: South Van Ness at Market 3801
## 14: 2nd at Folsom 551
## 15: Market at 4th 1272
## 16: Market at 10th 1073
## 17: Market at Sansome 829
## 18: Embarcadero at Bryant 993
## 19: Temporary Transbay Terminal (Howard at Beale) 656
## 20: Civic Center BART (7th at Market) 1288
## 21: San Francisco Caltrain 2 (330 Townsend) 702
## 22: Grant Avenue at Columbus Avenue 1245
## 23: Paseo de San Antonio 1813
## 24: San Jose Civic Center 3554
## 25: University and Emerson 5435
## 26: Townsend at 7th 701
## 27: Embarcadero at Vallejo 1646
## 28: Washington at Kearney 2335
## 29: Spear at Folsom 740
## 30: San Francisco Caltrain (Townsend at 4th) 819
## 31: Davis at Jackson 936
## 32: Clay at Battery 1200
## 33: Golden Gate at Polk 1196
## 34: Yerba Buena Center of the Arts (3rd @ Howard) 871
## 35: Powell at Post (Union Square) 1608
## 36: San Antonio Caltrain Station 2820
## 37: Rengstorff Avenue / California Street 4363
## 38: Cowper at University 2190
## 39: Mechanics Plaza (Market at Battery) 1007
## 40: Mountain View Caltrain Station 1268
## 41: Adobe on Almaden 845
## 42: Commercial at Montgomery 796
## 43: SJSU - San Salvador at 9th 937
## 44: Post at Kearney 937
## 45: California Ave Caltrain Station 4294
## 46: St James Park 938
## 47: Mountain View City Hall 1558
## 48: San Salvador at 1st 1101
## 49: Evelyn Park and Ride 1212
## 50: San Jose Diridon Caltrain Station 857
## 51: Redwood City Caltrain Station 4221
## 52: Palo Alto Caltrain Station 3220
## 53: San Jose City Hall 1001
## 54: SJSU 4th at San Carlos 2096
## 55: Park at Olive 3705
## 56: Arena Green / SAP Center 2050
## 57: San Pedro Square 969
## 58: MLK Library 1885
## 59: Japantown 2464
## 60: Broadway at Main 3473
## 61: San Jose Government Center 1068
## 62: Castro Street and El Camino Real 1831
## 63: San Mateo County Center 4046
## 64: San Antonio Shopping Center 1373
## 65: Franklin at Maple 803
## 66: Redwood City Medical Center 1912
## 67: Redwood City Public Library 3474
## 68: Broadway St at Battery St 883
## 69: Mezes Park 771
## 70: Washington at Kearny 1886
## 71: Post at Kearny 1035
## 72: Santa Clara County Civic Center 1379
## 73: Ryland Park 1407
## 74: Stanford in Redwood City 1878
## start_station mean_duration
# Compute the mean duration for every start and end station
# (by = .() groups on multiple columns at once)
mean_station <- batrips[, .(mean_duration=mean(duration)), by = .(start_station, end_station)]
mean_station
## start_station end_station
## 1: San Francisco City Hall Townsend at 7th
## 2: Embarcadero at Sansome Beale at Market
## 3: Steuart at Market Powell Street BART
## 4: Steuart at Market Washington at Kearney
## 5: 5th at Howard Yerba Buena Center of the Arts (3rd @ Howard)
## ---
## 1859: Mezes Park Palo Alto Caltrain Station
## 1860: Redwood City Public Library Stanford in Redwood City
## 1861: University and Emerson Redwood City Public Library
## 1862: Franklin at Maple Cowper at University
## 1863: University and Emerson San Mateo County Center
## mean_duration
## 1: 679
## 2: 651
## 3: 884
## 4: 1553
## 5: 1811
## ---
## 1859: 2518
## 1860: 639
## 1861: 1395
## 1862: 2654
## 1863: 1918
# Compute the mean duration grouped by start_station and month
# (expressions such as month(start_date) are allowed directly in by =;
# the derived grouping column appears in the result as "month")
mean_start_station <- batrips[, .(mean_duration=mean(duration)), by=.(start_station, month(start_date))]
mean_start_station
## start_station month mean_duration
## 1: San Francisco City Hall 1 1548
## 2: Embarcadero at Sansome 1 952
## 3: Steuart at Market 1 757
## 4: 5th at Howard 1 599
## 5: Harry Bridges Plaza (Ferry Building) 1 1429
## ---
## 830: California Ave Caltrain Station 12 4230
## 831: University and Emerson 12 7771
## 832: SJSU - San Salvador at 9th 12 653
## 833: San Mateo County Center 12 5034
## 834: Redwood City Public Library 12 496
# Compute mean of duration and total trips grouped by start and end stations
# (.N is the special symbol for the number of rows in the current group)
aggregate_mean_trips <- batrips[, .(mean_duration=mean(duration), total_trips=.N), by=.(start_station, end_station)]
aggregate_mean_trips
## start_station end_station
## 1: San Francisco City Hall Townsend at 7th
## 2: Embarcadero at Sansome Beale at Market
## 3: Steuart at Market Powell Street BART
## 4: Steuart at Market Washington at Kearney
## 5: 5th at Howard Yerba Buena Center of the Arts (3rd @ Howard)
## ---
## 1859: Mezes Park Palo Alto Caltrain Station
## 1860: Redwood City Public Library Stanford in Redwood City
## 1861: University and Emerson Redwood City Public Library
## 1862: Franklin at Maple Cowper at University
## 1863: University and Emerson San Mateo County Center
## mean_duration total_trips
## 1: 679 121
## 2: 651 545
## 3: 884 145
## 4: 1553 9
## 5: 1811 59
## ---
## 1859: 2518 2
## 1860: 639 1
## 1861: 1395 1
## 1862: 2654 1
## 1863: 1918 2
# Compute min and max duration grouped by start station, end station, and month
# (three grouping columns, one of them derived via month(start_date))
aggregate_min_max <- batrips[, .(min_duration=min(duration), max_duration=max(duration)), by=.(start_station, end_station, month(start_date))]
aggregate_min_max
## start_station
## 1: San Francisco City Hall
## 2: Embarcadero at Sansome
## 3: Steuart at Market
## 4: Steuart at Market
## 5: 5th at Howard
## ---
## 17665: Civic Center BART (7th at Market)
## 17666: Broadway St at Battery St
## 17667: Japantown
## 17668: Arena Green / SAP Center
## 17669: Washington at Kearny
## end_station month min_duration
## 1: Townsend at 7th 1 370
## 2: Beale at Market 1 345
## 3: Powell Street BART 1 498
## 4: Washington at Kearney 1 312
## 5: Yerba Buena Center of the Arts (3rd @ Howard) 1 349
## ---
## 17665: Embarcadero at Vallejo 12 870
## 17666: Washington at Kearny 12 158
## 17667: Japantown 12 3237
## 17668: Ryland Park 12 767
## 17669: Howard at 2nd 12 915
## max_duration
## 1: 661
## 2: 1674
## 3: 1620
## 4: 784
## 5: 624
## ---
## 17665: 870
## 17666: 158
## 17667: 3249
## 17668: 767
## 17669: 965
# Arrange the total trips grouped by start_station and end_station in decreasing order
# (chained expression: first count rows per group, then sort the result;
# order(-N) sorts the count column N in descending order)
trips_dec <- batrips[, .N, by = .(start_station, end_station)][order(-N)]
trips_dec
## start_station
## 1: Townsend at 7th
## 2: San Francisco Caltrain 2 (330 Townsend)
## 3: Harry Bridges Plaza (Ferry Building)
## 4: 2nd at Townsend
## 5: Market at 10th
## ---
## 1859: Redwood City Caltrain Station
## 1860: Mezes Park
## 1861: Redwood City Public Library
## 1862: University and Emerson
## 1863: Franklin at Maple
## end_station N
## 1: San Francisco Caltrain (Townsend at 4th) 3158
## 2: Townsend at 7th 2937
## 3: Embarcadero at Sansome 2826
## 4: Harry Bridges Plaza (Ferry Building) 2330
## 5: San Francisco Caltrain (Townsend at 4th) 2131
## ---
## 1859: California Ave Caltrain Station 1
## 1860: California Ave Caltrain Station 1
## 1861: Stanford in Redwood City 1
## 1862: Redwood City Public Library 1
## 1863: Cowper at University 1
# Top five most popular destinations
# (count per end_station, sort descending, then take the first five rows)
top_5 <- batrips[, .N, by = .(end_station)][order(-N)][1:5]
top_5
## end_station N
## 1: San Francisco Caltrain (Townsend at 4th) 33213
## 2: Harry Bridges Plaza (Ferry Building) 15692
## 3: San Francisco Caltrain 2 (330 Townsend) 15333
## 4: Market at Sansome 14816
## 5: 2nd at Townsend 14064
# Compute most popular end station for every start station
# (trips_dec is already sorted by descending N, so head(end_station, 1)
# per start_station picks the destination with the highest trip count)
popular_end_station <- trips_dec[, .(end_station = head(end_station, 1)), by = .(start_station)]
popular_end_station
## start_station
## 1: Townsend at 7th
## 2: San Francisco Caltrain 2 (330 Townsend)
## 3: Harry Bridges Plaza (Ferry Building)
## 4: 2nd at Townsend
## 5: Market at 10th
## 6: 2nd at South Park
## 7: Embarcadero at Sansome
## 8: Market at Sansome
## 9: Steuart at Market
## 10: Embarcadero at Folsom
## 11: San Francisco Caltrain (Townsend at 4th)
## 12: Temporary Transbay Terminal (Howard at Beale)
## 13: 5th at Howard
## 14: Market at 4th
## 15: 2nd at Folsom
## 16: Mountain View Caltrain Station
## 17: Powell Street BART
## 18: Grant Avenue at Columbus Avenue
## 19: Howard at 2nd
## 20: Yerba Buena Center of the Arts (3rd @ Howard)
## 21: Mountain View City Hall
## 22: Civic Center BART (7th at Market)
## 23: Santa Clara at Almaden
## 24: San Jose Diridon Caltrain Station
## 25: Castro Street and El Camino Real
## 26: Beale at Market
## 27: Embarcadero at Bryant
## 28: South Van Ness at Market
## 29: San Antonio Shopping Center
## 30: San Antonio Caltrain Station
## 31: Embarcadero at Vallejo
## 32: Evelyn Park and Ride
## 33: Broadway St at Battery St
## 34: Clay at Battery
## 35: Spear at Folsom
## 36: Davis at Jackson
## 37: Commercial at Montgomery
## 38: San Pedro Square
## 39: Mechanics Plaza (Market at Battery)
## 40: Powell at Post (Union Square)
## 41: San Jose City Hall
## 42: University and Emerson
## 43: Golden Gate at Polk
## 44: Cowper at University
## 45: Paseo de San Antonio
## 46: Adobe on Almaden
## 47: Post at Kearny
## 48: St James Park
## 49: Arena Green / SAP Center
## 50: Palo Alto Caltrain Station
## 51: San Salvador at 1st
## 52: Ryland Park
## 53: SJSU - San Salvador at 9th
## 54: Santa Clara County Civic Center
## 55: San Francisco City Hall
## 56: Washington at Kearny
## 57: MLK Library
## 58: San Jose Civic Center
## 59: Rengstorff Avenue / California Street
## 60: Japantown
## 61: California Ave Caltrain Station
## 62: Redwood City Caltrain Station
## 63: Mezes Park
## 64: SJSU 4th at San Carlos
## 65: Park at Olive
## 66: Post at Kearney
## 67: Franklin at Maple
## 68: Redwood City Public Library
## 69: Redwood City Medical Center
## 70: San Mateo County Center
## 71: Stanford in Redwood City
## 72: Washington at Kearney
## 73: Broadway at Main
## 74: San Jose Government Center
## start_station
## end_station
## 1: San Francisco Caltrain (Townsend at 4th)
## 2: Townsend at 7th
## 3: Embarcadero at Sansome
## 4: Harry Bridges Plaza (Ferry Building)
## 5: San Francisco Caltrain (Townsend at 4th)
## 6: Market at Sansome
## 7: Steuart at Market
## 8: 2nd at South Park
## 9: San Francisco Caltrain (Townsend at 4th)
## 10: San Francisco Caltrain (Townsend at 4th)
## 11: Temporary Transbay Terminal (Howard at Beale)
## 12: San Francisco Caltrain (Townsend at 4th)
## 13: San Francisco Caltrain (Townsend at 4th)
## 14: San Francisco Caltrain (Townsend at 4th)
## 15: Market at Sansome
## 16: Mountain View City Hall
## 17: San Francisco Caltrain (Townsend at 4th)
## 18: Market at Sansome
## 19: San Francisco Caltrain (Townsend at 4th)
## 20: San Francisco Caltrain (Townsend at 4th)
## 21: Mountain View Caltrain Station
## 22: Townsend at 7th
## 23: San Jose Diridon Caltrain Station
## 24: Santa Clara at Almaden
## 25: Mountain View Caltrain Station
## 26: San Francisco Caltrain (Townsend at 4th)
## 27: San Francisco Caltrain (Townsend at 4th)
## 28: San Francisco Caltrain (Townsend at 4th)
## 29: San Antonio Caltrain Station
## 30: San Antonio Shopping Center
## 31: Steuart at Market
## 32: Mountain View Caltrain Station
## 33: San Francisco Caltrain (Townsend at 4th)
## 34: San Francisco Caltrain (Townsend at 4th)
## 35: San Francisco Caltrain (Townsend at 4th)
## 36: San Francisco Caltrain (Townsend at 4th)
## 37: San Francisco Caltrain (Townsend at 4th)
## 38: San Jose Diridon Caltrain Station
## 39: Market at 4th
## 40: San Francisco Caltrain (Townsend at 4th)
## 41: San Jose Diridon Caltrain Station
## 42: University and Emerson
## 43: San Francisco Caltrain (Townsend at 4th)
## 44: Palo Alto Caltrain Station
## 45: San Jose Diridon Caltrain Station
## 46: San Jose Diridon Caltrain Station
## 47: San Francisco Caltrain (Townsend at 4th)
## 48: San Jose Diridon Caltrain Station
## 49: Santa Clara at Almaden
## 50: Cowper at University
## 51: MLK Library
## 52: Santa Clara County Civic Center
## 53: San Jose Diridon Caltrain Station
## 54: Ryland Park
## 55: Powell Street BART
## 56: Powell Street BART
## 57: San Jose Diridon Caltrain Station
## 58: San Jose Civic Center
## 59: Mountain View Caltrain Station
## 60: San Jose Diridon Caltrain Station
## 61: Palo Alto Caltrain Station
## 62: Mezes Park
## 63: Redwood City Caltrain Station
## 64: Santa Clara at Almaden
## 65: Palo Alto Caltrain Station
## 66: Washington at Kearney
## 67: Redwood City Caltrain Station
## 68: Redwood City Caltrain Station
## 69: Redwood City Caltrain Station
## 70: San Mateo County Center
## 71: Redwood City Caltrain Station
## 72: Powell Street BART
## 73: Broadway at Main
## 74: Japantown
## end_station
# Find the first and last ride for each start_station
# (order(start_date) in i sorts first; head/tail then pick the earliest and
# latest ride per group, yielding two rows per start_station)
first_last <- batrips[order(start_date),
.(start_date = c(head(start_date, 1), tail(start_date, 1))),
by = .(start_station)]
first_last
## start_station start_date
## 1: San Francisco City Hall 2014-01-01 00:14:00
## 2: San Francisco City Hall 2014-12-31 22:06:00
## 3: Embarcadero at Sansome 2014-01-01 00:17:00
## 4: Embarcadero at Sansome 2014-12-31 22:08:00
## 5: Steuart at Market 2014-01-01 00:23:00
## ---
## 144: Santa Clara County Civic Center 2014-12-31 15:32:00
## 145: Ryland Park 2014-04-10 09:10:00
## 146: Ryland Park 2014-12-31 07:56:00
## 147: Stanford in Redwood City 2014-09-03 19:41:00
## 148: Stanford in Redwood City 2014-12-22 16:56:00
# Columns to keep in the per-group subset (.SD) below
relevant_cols <- c("start_station", "end_station", "start_date", "end_date", "duration")
# Find the row corresponding to the shortest trip per month
# (.SD holds the current group's rows restricted to .SDcols;
# which.min(duration) indexes the single row with the minimum duration)
shortest <- batrips[, .SD[which.min(duration)], by = month(start_date), .SDcols = relevant_cols]
shortest
## month start_station
## 1: 1 2nd at Townsend
## 2: 2 San Francisco Caltrain (Townsend at 4th)
## 3: 3 Mechanics Plaza (Market at Battery)
## 4: 4 South Van Ness at Market
## 5: 5 Market at 10th
## 6: 6 Powell Street BART
## 7: 7 Powell Street BART
## 8: 8 Broadway St at Battery St
## 9: 9 Civic Center BART (7th at Market)
## 10: 10 Yerba Buena Center of the Arts (3rd @ Howard)
## 11: 11 Temporary Transbay Terminal (Howard at Beale)
## 12: 12 2nd at South Park
## end_station start_date
## 1: 2nd at Townsend 2014-01-21 13:01:00
## 2: San Francisco Caltrain (Townsend at 4th) 2014-02-08 14:28:00
## 3: Mechanics Plaza (Market at Battery) 2014-03-18 17:50:00
## 4: South Van Ness at Market 2014-04-12 04:28:00
## 5: Market at 10th 2014-05-14 20:11:00
## 6: Powell Street BART 2014-06-23 17:31:00
## 7: Powell Street BART 2014-07-14 14:09:00
## 8: Broadway St at Battery St 2014-08-15 15:15:00
## 9: Civic Center BART (7th at Market) 2014-09-04 10:53:00
## 10: Yerba Buena Center of the Arts (3rd @ Howard) 2014-10-04 19:21:00
## 11: Temporary Transbay Terminal (Howard at Beale) 2014-11-07 09:45:00
## 12: 2nd at South Park 2014-12-08 11:38:00
## end_date duration
## 1: 2014-01-21 13:02:00 60
## 2: 2014-02-08 14:29:00 61
## 3: 2014-03-18 17:51:00 60
## 4: 2014-04-12 04:29:00 61
## 5: 2014-05-14 20:12:00 60
## 6: 2014-06-23 17:32:00 60
## 7: 2014-07-14 14:10:00 60
## 8: 2014-08-15 15:16:00 60
## 9: 2014-09-04 10:54:00 60
## 10: 2014-10-04 19:22:00 60
## 11: 2014-11-07 09:46:00 60
## 12: 2014-12-08 11:39:00 60
# Find the total number of unique start stations and zip codes per month
# (lapply(.SD, uniqueN) applies the count-of-unique-values function to each
# column named in .SDcols, within each month group)
unique_station_month <- batrips[, lapply(.SD, FUN=uniqueN),
by = month(start_date),
.SDcols = c("start_station", "zip_code")]
unique_station_month
## month start_station zip_code
## 1: 1 68 710
## 2: 2 69 591
## 3: 3 69 894
## 4: 4 70 895
## 5: 5 70 1073
## 6: 6 70 1028
## 7: 7 70 1068
## 8: 8 70 1184
## 9: 9 70 971
## 10: 10 70 991
## 11: 11 70 769
## 12: 12 68 586
Chapter 4 - Reference Semantics
Adding and Updating Columns by Reference:
:=(is_dur_gt_1hour = NULL, start_station = toupper(start_station))] # note that := is the function that is being called and that NULL means "delete the column"
Grouped Aggregations:
Advanced Aggregations:
:=(end_dur_first = duration[1], end_dur_last = duration[.N]), by = end_station] and med_dur <- median(dur, na.rm = TRUE); if (med_dur < 600) "short"
Example code includes:
data(batrips, package="bikeshare14")
batrips <- as.data.table(batrips)
batrips_new = batrips
makeNA <- sample(1:nrow(batrips), round(0.05*nrow(batrips)), replace=FALSE)
batrips_new[makeNA, "duration"] <- NA
# Add a new column, duration_hour
batrips[, duration_hour := duration/3600]
# Print untidy
# untidy[1:2]
# Fix spelling in the second row of start_station
# untidy[2, start_station:="San Francisco City Hall"]
# Replace negative duration values with NA
# untidy[duration < 0, duration:=NA]
# Add a new column equal to total trips for every start station
batrips[, trips_N:=.N, by = start_station]
# Add new column for every start_station and end_station
batrips[, duration_mean:=mean(duration), by = .(start_station, end_station)]
# Calculate the mean duration for each month
batrips_new[, mean_dur:=mean(duration, na.rm=TRUE), by = month(start_date)]
# Replace NA values in duration with the mean value of duration for that month
batrips_new[, mean_dur := mean(duration, na.rm = TRUE), by = month(start_date)][is.na(duration), duration:=round(mean_dur,0)]
# Delete the mean_dur column by reference
batrips_new[, mean_dur := mean(duration, na.rm = TRUE), by = month(start_date)][is.na(duration), duration := mean_dur][, mean_dur:=NULL]
# Add columns using the LHS := RHS form
batrips[, c("mean_duration", "median_duration"):=.(mean(duration), as.integer(round(median(duration), 0))), by=start_station]
# Add columns using the functional form
batrips[, `:=`(mean_duration=mean(duration), median_duration=as.integer(round(median(duration), 0))), by = start_station]
# Add the mean_duration column
batrips[duration > 600, mean_duration:=mean(duration), by=.(start_station, end_station)]
Chapter 5 - Importing and Exporting Data
Fast data reading with fread():
Advanced file reading:
Fast data writing with fwrite():
Example code includes:
data(batrips, package="bikeshare14")
batrips <- as.data.table(batrips)
readr::write_csv(batrips, "./RInputFiles/_batrips.csv")
# Use read.csv() to import batrips
system.time(read.csv("./RInputFiles/_batrips.csv"))
## user system elapsed
## 3.46 0.13 3.67
# Use fread() to import batrips
system.time(fread("./RInputFiles/_batrips.csv"))
## user system elapsed
## 0.34 0.04 0.38
cat('id,"name",val
29192,"Robert Whitaker", 200
49301 ,"Elisa Waters",190
', file="./RInputFiles/_sample.csv")
# Import using read.csv()
csv_file <- read.csv("./RInputFiles/_sample.csv", fill = NA, quote = "", stringsAsFactors = FALSE, strip.white = TRUE, header = TRUE)
csv_file
## id X.name. val
## 1 29192 "Robert Whitaker" 200
## 2 49301 "Elisa Waters" 190
# Import using fread()
csv_file <- fread("./RInputFiles/_sample.csv")
csv_file
## id name val
## 1: 29192 Robert Whitaker 200
## 2: 49301 Elisa Waters 190
cat("id,name,val
29192,Robert Whitaker, 200
49301 ,Elisa Waters,190
34456 , Karla Schmidt,458
", file="./RInputFiles/_sample.csv")
# Select "id" and "val" columns
select_columns <- fread("./RInputFiles/_sample.csv", select=c("id", "val"))
select_columns
## id val
## 1: 29192 200
## 2: 49301 190
## 3: 34456 458
# Drop the "val" column
drop_column <- fread("./RInputFiles/_sample.csv", drop=c("val"))
drop_column
## id name
## 1: 29192 Robert Whitaker
## 2: 49301 Elisa Waters
## 3: 34456 Karla Schmidt
cat('id,"name",val
29192,"Robert Whitaker", 200
49301 , Elisa Waters,190
34456 , Karla Schmidt,458
END-OF-DATA
METADATA
attr;value
date;"2018-01-01"
data;"cash payment"
', file="./RInputFiles/_sample.csv")
# Import the file
entire_file <- fread("./RInputFiles/_sample.csv")
## Warning in fread("./RInputFiles/_sample.csv"): Stopped early on line 5.
## Expected 3 fields but found 0. Consider fill=TRUE and comment.char=. First
## discarded non-empty line: <<END-OF-DATA>>
entire_file
## id name val
## 1: 29192 Robert Whitaker 200
## 2: 49301 Elisa Waters 190
## 3: 34456 Karla Schmidt 458
# Import the file while avoiding the warning
only_data <- fread("./RInputFiles/_sample.csv", nrows=3)
only_data
## id name val
## 1: 29192 Robert Whitaker 200
## 2: 49301 Elisa Waters 190
## 3: 34456 Karla Schmidt 458
# Import only the metadata
only_metadata <- fread("./RInputFiles/_sample.csv", skip="attr;value")
only_metadata
## attr value
## 1: date 2018-01-01
## 2: data cash payment
cat('id,name,val
9002019291929192,Robert Whitaker, 200
9200129401349301 ,Elisa Waters,190
9200149429834456 , Karla Schmidt,458
', file="./RInputFiles/_sample.csv")
# Import the file using fread
fread_import <- fread("./RInputFiles/_sample.csv")
# Import the file using read.csv
base_import <- read.csv("./RInputFiles/_sample.csv")
# Check the class of id column
class(fread_import$id)
## [1] "integer64"
class(base_import$id)
## [1] "numeric"
cat('c1,c2,c3,c3.1,c5,n1,n2,n3,n4,n5
aa,bb,cc,dd,ee,1,2,3,4,5
ff,gg,hh,ii,jj,6,7,8,9,10
', file="./RInputFiles/_sample.csv")
# Import using read.csv with defaults
base_r_defaults <- read.csv("./RInputFiles/_sample.csv")
str(base_r_defaults)
## 'data.frame': 2 obs. of 10 variables:
## $ c1 : Factor w/ 2 levels "aa","ff": 1 2
## $ c2 : Factor w/ 2 levels "bb","gg": 1 2
## $ c3 : Factor w/ 2 levels "cc","hh": 1 2
## $ c3.1: Factor w/ 2 levels "dd","ii": 1 2
## $ c5 : Factor w/ 2 levels "ee","jj": 1 2
## $ n1 : int 1 6
## $ n2 : int 2 7
## $ n3 : int 3 8
## $ n4 : int 4 9
## $ n5 : int 5 10
# Import using read.csv
base_r <- read.csv("./RInputFiles/_sample.csv",
colClasses = c(rep("factor", 4), "character", "integer", rep("numeric", 4))
)
str(base_r)
## 'data.frame': 2 obs. of 10 variables:
## $ c1 : Factor w/ 2 levels "aa","ff": 1 2
## $ c2 : Factor w/ 2 levels "bb","gg": 1 2
## $ c3 : Factor w/ 2 levels "cc","hh": 1 2
## $ c3.1: Factor w/ 2 levels "dd","ii": 1 2
## $ c5 : chr "ee" "jj"
## $ n1 : int 1 6
## $ n2 : num 2 7
## $ n3 : num 3 8
## $ n4 : num 4 9
## $ n5 : num 5 10
# Import using fread
import_fread <- fread("./RInputFiles/_sample.csv", colClasses = list(factor=1:4, numeric=7:10))
str(import_fread)
## Classes 'data.table' and 'data.frame': 2 obs. of 10 variables:
## $ c1 : Factor w/ 2 levels "aa","ff": 1 2
## $ c2 : Factor w/ 2 levels "bb","gg": 1 2
## $ c3 : Factor w/ 2 levels "cc","hh": 1 2
## $ c3.1: Factor w/ 2 levels "dd","ii": 1 2
## $ c5 : chr "ee" "jj"
## $ n1 : int 1 6
## $ n2 : num 2 7
## $ n3 : num 3 8
## $ n4 : num 4 9
## $ n5 : num 5 10
## - attr(*, ".internal.selfref")=<externalptr>
cat('id,name,val
9002019291929192,Robert Whitaker,
9200129401349301 ,Elisa Waters,190
9200149429834456 , Karla Schmidt
', file="./RInputFiles/_sample.csv")
# Import the file and note the warning message
incorrect <- fread("./RInputFiles/_sample.csv")
## Warning in fread("./RInputFiles/_sample.csv"): Discarded single-line
## footer: <<9200149429834456 , Karla Schmidt>>
incorrect
## id name val
## 1: 9002019291929192 Robert Whitaker NA
## 2: 9200129401349301 Elisa Waters 190
# Import the file correctly
correct <- fread("./RInputFiles/_sample.csv", fill=TRUE)
correct
## id name val
## 1: 9002019291929192 Robert Whitaker NA
## 2: 9200129401349301 Elisa Waters 190
## 3: 9200149429834456 Karla Schmidt NA
# Import the file using na.strings
missing_values <- fread("./RInputFiles/_sample.csv", na.strings="##")
## Warning in fread("./RInputFiles/_sample.csv", na.strings = "##"): Discarded
## single-line footer: <<9200149429834456 , Karla Schmidt>>
missing_values
## id name val
## 1: 9002019291929192 Robert Whitaker NA
## 2: 9200129401349301 Elisa Waters 190
dt <- data.table(id=c(29192L, 49301L, 34456L),
name=c("Robert, Whitaker", "Elisa, Waters", "Karla, Schmidt"),
vals=list(c(144, 48, 32), c(22, 289), 458)
)
dt
## id name vals
## 1: 29192 Robert, Whitaker 144, 48, 32
## 2: 49301 Elisa, Waters 22,289
## 3: 34456 Karla, Schmidt 458
# Write dt to fwrite.txt
fwrite(dt, "./RInputFiles/_fwrite.txt")
# Import the file using readLines()
readLines("./RInputFiles/_fwrite.txt")
## [1] "id,name,vals"
## [2] "29192,\"Robert, Whitaker\",144|48|32"
## [3] "49301,\"Elisa, Waters\",22|289"
## [4] "34456,\"Karla, Schmidt\",458"
# Import the file using fread()
fread("./RInputFiles/_fwrite.txt")
## id name vals
## 1: 29192 Robert, Whitaker 144|48|32
## 2: 49301 Elisa, Waters 22|289
## 3: 34456 Karla, Schmidt 458
batrips_dates <- batrips[1:5, c("start_date", "end_date")]
batrips_dates
## start_date end_date
## 1: 2014-01-01 00:14:00 2014-01-01 00:21:00
## 2: 2014-01-01 00:14:00 2014-01-01 00:21:00
## 3: 2014-01-01 00:17:00 2014-01-01 00:42:00
## 4: 2014-01-01 00:23:00 2014-01-01 00:50:00
## 5: 2014-01-01 00:23:00 2014-01-01 00:50:00
# Write batrips_dates to file using "ISO" format
fwrite(batrips_dates, "./RInputFiles/_iso.txt", dateTimeAs="ISO")
# Import the file back
iso <- fread("./RInputFiles/_iso.txt")
iso
## start_date end_date
## 1: 2014-01-01T08:14:00Z 2014-01-01T08:21:00Z
## 2: 2014-01-01T08:14:00Z 2014-01-01T08:21:00Z
## 3: 2014-01-01T08:17:00Z 2014-01-01T08:42:00Z
## 4: 2014-01-01T08:23:00Z 2014-01-01T08:50:00Z
## 5: 2014-01-01T08:23:00Z 2014-01-01T08:50:00Z
# Write batrips_dates to file using "squash" format
fwrite(batrips_dates, "./RInputFiles/_squash.txt", dateTimeAs="squash")
# Import the file back
squash <- fread("./RInputFiles/_squash.txt")
squash
## start_date end_date
## 1: 20140101081400000 20140101082100000
## 2: 20140101081400000 20140101082100000
## 3: 20140101081700000 20140101084200000
## 4: 20140101082300000 20140101085000000
## 5: 20140101082300000 20140101085000000
# Write batrips_dates to file using "epoch" format
fwrite(batrips_dates, "./RInputFiles/_epoch.txt", dateTimeAs="epoch")
# Import the file back
epoch <- fread("./RInputFiles/_epoch.txt")
epoch
## start_date end_date
## 1: 1388564040 1388564460
## 2: 1388564040 1388564460
## 3: 1388564220 1388565720
## 4: 1388564580 1388566200
## 5: 1388564580 1388566200
# Use write.table() to write batrips
system.time(write.table(batrips, "./RInputFiles/_base-r.txt"))
## user system elapsed
## 12.51 0.45 13.59
# Use fwrite() to write batrips
system.time(fwrite(batrips, "./RInputFiles/_data-table.txt"))
## user system elapsed
## 0.24 0.03 0.15
Chapter 1 - Introduction and Classic Puzzles
Introduction:
Birthday Problem:
Monty Hall:
Example code includes:
# Set seed to 1
set.seed(1)
# Write a function to roll k dice
roll_dice <- function(k){
  # Roll k fair six-sided dice and return the sum of the faces shown.
  faces <- c(1, 2, 3, 4, 5, 6)
  sum(sample(faces, size = k, replace = TRUE))
}
# Run the function to roll five dice
roll_dice(5)
## [1] 22
# Initialize a vector to store the output
output <- rep(NA, 10000)
# Loop for 10000 iterations
for(i in 1:10000){
# Fill in the output vector with the result from rolling two dice
output[i] <- roll_dice(2)
}
set.seed(1)
n <- 50
match <- 0
# Simulate 10000 rooms and check for matches in each room
for(i in 1:10000){
birthdays <- sample(1:365, n, replace = TRUE)
if(length(unique(birthdays)) < n){ match <- match + 1 }
}
# Calculate the estimated probability of a match and print it
p_match <- match/10000
print(p_match)
## [1] 0.971
# Calculate the probability of a match for a room size of 50
pbirthday(50)
## [1] 0.97
# Define the vector of sample sizes
room_sizes <- 1:50
# Run the pbirthday function within sapply on the vector of sample sizes
match_probs <- sapply(room_sizes, FUN=pbirthday)
# Create the plot
plot(match_probs ~ room_sizes)
set.seed(1)
doors <- c(1,2,3)
# Randomly select one of the doors to have the prize
prize <- sample(x = doors, size = 1)
initial_choice <- 1
# Check if the initial choice equals the prize
if(prize == initial_choice){
print("The initial choice was correct!")
}
print(prize)
## [1] 2
set.seed(1)
doors <- c(1,2,3)
# Define counter
win_count <- 0
# Run 10000 iterations of the game
for(i in 1:10000){
prize <- sample(x = doors, size = 1)
initial_choice <- 1
if(initial_choice == prize){ win_count <- win_count + 1 }
}
# Print the answer
print(win_count / 10000)
## [1] 0.336
reveal_door <- function(doors, prize, initial_choice){
  # Monty opens one door that is neither the prize nor the player's pick.
  # Works on door *values* via setdiff() rather than negative positional
  # indexing (the original doors[-prize] was correct only because the
  # door labels happened to equal their positions, doors = 1:3).
  remaining <- setdiff(doors, c(prize, initial_choice))
  if(prize == initial_choice){
    # Two doors remain: open one of them at random. Index explicitly to
    # avoid the sample(x, 1) surprise when x is a single number.
    reveal <- remaining[sample.int(length(remaining), 1)]
  } else {
    # Exactly one door is neither the prize nor the choice: must open it.
    reveal <- remaining
  }
  reveal
}
set.seed(1)
prize <- sample(doors,1)
initial_choice <- 1
# Use the reveal_door function to do the reveal
reveal <- reveal_door(doors, prize, initial_choice)
# Switch to the remaining door
final_choice <- doors[-c(initial_choice, reveal)]
print(final_choice)
## [1] 2
# Check whether the final choice equals the prize
if(final_choice==prize){
print("The final choice is correct!")
}
## [1] "The final choice is correct!"
# Initialize the win counter
win_count <- 0
for(i in 1:10000){
prize <- sample(doors,1)
initial_choice <- 1
reveal <- reveal_door(doors, prize, initial_choice)
final_choice <- doors[-c(initial_choice, reveal)]
if(final_choice == prize){
# Increment the win counter
win_count <- win_count + 1
}
}
# Print the estimated probability of winning
print(win_count / 10000)
## [1] 0.666
Chapter 2 - Games with Dice
Yahtzee:
Settlers of Catan:
all_rolls <- sample(c(1,2,3,4,5,6), k, replace = TRUE); final_answer <- sum(all_rolls)
Craps:
Example code includes:
# Calculate the size of the sample space
s_space <- 6**5
# Calculate the probability of a Yahtzee
p_yahtzee <- 6 / s_space
# Print the answer
print(p_yahtzee)
## [1] 0.000772
s_space <- 6^5
# Calculate the probabilities
p_12345 <- factorial(5) / s_space
p_23456 <- factorial(5) / s_space
p_large_straight <- p_12345 + p_23456
# Print the large straight probability
print(p_large_straight)
## [1] 0.0309
s_space <- 6^5
# Calculate the number of denominations possible
n_denom <- factorial(6) / factorial(4)
# Calculate the number of ways to form the groups
n_groupings <- choose(5, 3)
# Calculate the total number of full houses
n_full_house <- n_denom * n_groupings
# Calculate and print the answer
print(n_full_house / s_space)
## [1] 0.0386
set.seed(1)
# Simulate one game (60 rolls) and store the result
rolls <- replicate(60, roll_dice(2))
# Display the result
table(rolls)
## rolls
## 2 4 5 6 7 8 9 10 11 12
## 3 5 7 7 13 9 9 3 3 1
set.seed(1)
counter <- 0
for(i in 1:10000){
# Roll two dice 60 times
rolls <- replicate(60, roll_dice(2))
# Check whether 2 or 12 was rolled more than twice
if(sum(rolls==2) > 2 | sum(rolls==12) > 2) { counter <- counter + 1 }
}
# Print the answer
print(counter/10000)
## [1] 0.414
roll_after_point <- function(point){
  # After a point is established, keep rolling two dice: the shooter
  # wins if the point reappears before a 7, and loses on a 7.
  repeat {
    latest <- roll_dice(2)
    if(latest == point){
      outcome <- TRUE
      break
    }
    if(latest == 7){
      outcome <- FALSE
      break
    }
  }
  return(outcome)
}
evaluate_first_roll <- function(roll){
  # Come-out roll rules of craps: 7 or 11 wins immediately, 2/3/12
  # loses immediately, anything else establishes the "point".
  if(roll %in% c(7, 11)){
    result <- TRUE
  } else if(roll %in% c(2, 3, 12)){
    result <- FALSE
  } else if(roll %in% c(4, 5, 6, 8, 9, 10)){
    # Keep rolling until either the point or a 7 decides the game
    result <- roll_after_point(roll)
  }
  return(result)
}
set.seed(1)
won <- rep(NA, 10000)
for(i in 1:10000){
# Shooter's first roll
roll <- roll_dice(2)
# Determine result and store it
won[i] <- evaluate_first_roll(roll)
}
sum(won)/10000
## [1] 0.494
Chapter 3 - Inspired from the Web
Factoring a Quadratic:
Four Digit iPhone Passcodes:
Sign Error Cancellations:
Example code includes:
is_factorable <- function(a, b, c){
  # TRUE when the quadratic a*x^2 + b*x + c factors over the integers,
  # i.e. when its discriminant b^2 - 4ac is a non-negative perfect
  # square. Computes the discriminant once (the original evaluated
  # b^2 - 4*a*c twice).
  discriminant <- b^2 - 4 * a * c
  # Negative discriminant: complex roots, so not factorable
  if(discriminant < 0){
    return(FALSE)
  }
  root <- sqrt(discriminant)
  # Perfect square iff the square root is a whole number. NOTE(review):
  # exact for the 1:100 coefficient range used in these notes; extremely
  # large inputs could hit floating-point precision limits.
  root == round(root)
}
counter <- 0
# Nested for loop
for(a in 1:100){
for(b in 1:100){
for(c in 1:100){
# Check whether factorable
if(is_factorable(a, b, c)){ counter <- counter + 1 }
}
}
}
print(counter / 100^3)
## [1] 0.0164
counter <- 0
# Store known values
values <- c(3, 4, 5, 9)
passcode = values
for(i in 1:10000){
# Create the guess
guess <- sample(values, replace=FALSE)
# Check condition
if(identical(passcode, guess)){ counter <- counter + 1 }
}
print(counter/10000)
## [1] 0.0445
counter <- 0
# Store known values
unique_values <- c(2, 4, 7)
passcode = c(unique_values, unique_values[1])
for(i in 1:10000){
# Pick repeated value
all_values <- c(unique_values, sample(unique_values, 1))
# Make guess
guess <- sample(all_values, replace=FALSE)
if(identical(passcode, guess)){ counter <- counter + 1 }
}
print(counter / 10000)
## [1] 0.026
set.seed(1)
# Run 10000 iterations, 0.1 sign switch probability
switch_a <- rbinom(10000, 3, prob=0.1)
# Calculate probability of correct answer
mean(switch_a/2==round(switch_a/2))
## [1] 0.764
# Run 10000 iterations, 0.45 sign switch probability
switch_b <- rbinom(10000, 3, prob=0.45)
# Calculate probability of correct answer
mean(switch_b/2==round(switch_b/2))
## [1] 0.508
set.seed(1)
counter <- 0
for(i in 1:10000){
# Simulate switches
each_switch <- sapply(c(0.49, 0.1), FUN=rbinom, size=1, n=1)
# Simulate switches
num_switches <- sum(each_switch)
# Check solution
if(num_switches/2 == round(num_switches/2)){ counter <- counter + 1 }
}
print(counter/10000)
## [1] 0.5
Chapter 4 - Poker
Texas Hold’em:
Consecutive Cashes:
von Neumann Model of Poker:
Wrap Up:
Example code includes:
p_win <- 8 / 46
curr_pot <- 50
bet <- 10
# Define vector of probabilities
probs <- c(p_win, 1-p_win)
# Define vector of values
values <- c(curr_pot, -bet)
# Calculate expected value
sum(probs*values)
## [1] 0.435
outs <- c(0:25)
# Calculate probability of not winning
p_no_outs <- choose(47-outs, 2) /choose(47, 2)
# Calculate probability of winning
p_win <- 1 - p_no_outs
print(p_win)
## [1] 0.0000 0.0426 0.0842 0.1249 0.1647 0.2035 0.2414 0.2784 0.3145 0.3497
## [11] 0.3839 0.4172 0.4496 0.4810 0.5116 0.5412 0.5698 0.5976 0.6244 0.6503
## [21] 0.6753 0.6994 0.7225 0.7447 0.7660 0.7863
players <- c(1:60)
count <- 0
for(i in 1:10000){
cash_year1 <- sample(players, 6)
cash_year2 <- sample(players, 6)
# Find those who cashed both years
cash_both <- intersect(cash_year1, cash_year2)
# Check whether anyone cashed both years
if(length(cash_both) > 0){ count <- count + 1 }
}
print(count/10000)
## [1] 0.49
check_for_five <- function(cashed){
  # TRUE if at least one player id appears in every column of `cashed`
  # (one column per year). Generalized from the original hard-coded
  # five-column list to any number of columns/years; for a five-column
  # matrix the result is unchanged.
  yearly <- lapply(seq_len(ncol(cashed)), function(j) cashed[, j])
  in_all_years <- Reduce(intersect, yearly)
  length(in_all_years) > 0
}
players <- c(1:6000)
count <- 0
for(i in 1:10000){
# Create matrix of cashing players
cashes <- replicate(5, sample(players, 600, replace=FALSE))
# Check for five time winners
if(check_for_five(cashes)){ count <- count + 1 }
}
print(count/10000)
## [1] 0.0604
# Generate values for both players
A <- runif(1)
B <- runif(1)
# Check winner
if(A > B){
print("Player A wins")
} else {
print("Player B wins")
}
## [1] "Player A wins"
print(A)
## [1] 0.706
print(B)
## [1] 0.0876
one_round <- function(bet_cutoff){
  # One hand of the simplified von Neumann poker game, scored from
  # player B's point of view: B bets only when B's card exceeds
  # bet_cutoff, winning 1 against a lower card and losing 1 otherwise.
  card_a <- runif(n = 1)
  card_b <- runif(n = 1)
  # B folds on a weak card: the hand is worth nothing
  if(card_b <= bet_cutoff){
    return(0)
  }
  # B bet: win 1 if B's card is higher, otherwise lose 1
  if(card_b > card_a) 1 else -1
}
b_win <- rep(NA, 10000)
for(i in 1:10000){
# Run one and store result
b_win[i] <- one_round(0.5)
}
# Print expected value
mean(b_win)
## [1] 0.241
Chapter 1 - Introduction to Highcharter
Introduction:
Two highcharter paradigms:
Data going forward:
Example code includes:
load("./RInputFiles/stock_prices_xts.RData")
load("./RInputFiles/stock_tidy_tibble_prices.RData")
load("./RInputFiles/stock_wide_tibble_returns.RData")
str(stock_prices_xts)
## An 'xts' object on 2012-12-31/2017-12-29 containing:
## Data: num [1:1260, 1:5] 251 257 258 259 268 ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr [1:5] "AMZN" "JPM" "DIS" "GOOG" ...
## Indexed by objects of class: [Date] TZ: UTC
## xts Attributes:
## List of 2
## $ src : chr "yahoo"
## $ updated: POSIXct[1:1], format: "2018-12-15 16:31:23"
str(stock_tidy_tibble_prices)
## Classes 'tbl_df', 'tbl' and 'data.frame': 6300 obs. of 3 variables:
## $ date : Date, format: "2012-12-31" "2013-01-02" ...
## $ symbol: chr "AMZN" "AMZN" "AMZN" "AMZN" ...
## $ price : num 251 257 258 259 268 ...
str(stock_wide_tibble_returns)
## Classes 'tbl_df', 'tbl' and 'data.frame': 1260 obs. of 6 variables:
## $ date: Date, format: "2012-12-31" "2013-01-02" ...
## $ AMZN: num NA 0.02535 0.00454 0.00259 0.03529 ...
## $ DIS : num NA 0.02597 0.00215 0.01896 -0.02365 ...
## $ GOOG: num NA 0.022187 0.000581 0.019568 -0.004373 ...
## $ JPM : num NA 0.02242 -0.00202 0.01757 0.0011 ...
## $ KO : num NA 0.03656 0 0.00159 -0.0096 ...
load("./RInputFiles/commodities_returns.RData")
load("./RInputFiles/commodities-returns-tidy.RData")
load("./RInputFiles/commodities-xts.RData")
str(commodities_returns)
## Classes 'tbl_df', 'tbl' and 'data.frame': 234 obs. of 4 variables:
## $ date : Date, format: "2017-01-04" "2017-01-05" ...
## $ gold : num 0.002836 0.013637 -0.00671 0.009753 0.000506 ...
## $ platinum : num 0.05683 0.00825 0.019 0.016 -0.00132 ...
## $ palladium: num 0.03897 0.01056 0.01667 0.00309 0 ...
## - attr(*, "na.action")= 'omit' Named int 1 10 11 35 36 37 75 76 85 86 ...
## ..- attr(*, "names")= chr "1" "10" "11" "35" ...
str(commodities_returns_tidy)
## Classes 'grouped_df', 'tbl_df', 'tbl' and 'data.frame': 771 obs. of 3 variables:
## $ date : Date, format: "2017-01-03" "2017-01-04" ...
## $ metal : chr "gold" "gold" "gold" "gold" ...
## $ return: num NA 0.00284 0.01364 -0.00671 0.00975 ...
## - attr(*, "vars")= chr "metal"
## - attr(*, "drop")= logi TRUE
## - attr(*, "indices")=List of 3
## ..$ : int 0 1 2 3 4 5 6 7 8 9 ...
## ..$ : int 514 515 516 517 518 519 520 521 522 523 ...
## ..$ : int 257 258 259 260 261 262 263 264 265 266 ...
## - attr(*, "group_sizes")= int 257 257 257
## - attr(*, "biggest_group_size")= int 257
## - attr(*, "labels")='data.frame': 3 obs. of 1 variable:
## ..$ metal: chr "gold" "palladium" "platinum"
## ..- attr(*, "vars")= chr "metal"
## ..- attr(*, "drop")= logi TRUE
str(commodities_xts)
## An 'xts' object on 2017-01-03/2017-12-29 containing:
## Data: num [1:257, 1:6] 1162 1165 1181 1173 1185 ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr [1:6] "gold" "copper" "oil" "silver" ...
## Indexed by objects of class: [Date] TZ: UTC
## xts Attributes:
## NULL
# Load the highcharter package
library(highcharter)
## Highcharts (www.highcharts.com) is a Highsoft software product which is
## not free for commercial and Governmental use
library(zoo)
quantmod::getSymbols("XLK", src="yahoo", from="2012-12-31", to="2017-12-31")
## 'getSymbols' currently uses auto.assign=TRUE by default, but will
## use auto.assign=FALSE in 0.5-0. You will still be able to use
## 'loadSymbols' to automatically load data. getOption("getSymbols.env")
## and getOption("getSymbols.auto.assign") will still be checked for
## alternate defaults.
##
## This message is shown once per session and may be disabled by setting
## options("getSymbols.warning4.0"=FALSE). See ?getSymbols for details.
## [1] "XLK"
# Strip the ticker prefix from an xts column name ("XLK.Open" -> "open").
# Base strsplit(..., fixed = TRUE) replaces the stringr
# str_split()/fixed() pair, removing the package dependency while
# producing the same result for these column-name inputs.
fix_vars <- function(x) { tolower(strsplit(x, ".", fixed = TRUE)[[1]][2]) }
xlk_prices <- XLK
names(xlk_prices) <- sapply(names(XLK), FUN=fix_vars) %>% unname()
# Build a candlestick chart
hchart(xlk_prices, type = "candlestick")
# Build a ohlc chart
hchart(xlk_prices, type = "ohlc")
# Build a line chart
hchart(xlk_prices$close, type = "line")
# Show the dates
head(index(xlk_prices))
## [1] "2012-12-31" "2013-01-02" "2013-01-03" "2013-01-04" "2013-01-07"
## [6] "2013-01-08"
# Use the base function and set the correct chart type
highchart(type = "stock") %>%
hc_add_series(xlk_prices)
xlk_prices_tibble <- fortify.zoo(xlk_prices) %>%
as_tibble() %>%
rename("date"="Index")
head(xlk_prices_tibble)
## # A tibble: 6 x 7
## date open high low close volume adjusted
## <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2012-12-31 28.3 29.0 28.2 28.8 15186300 25.8
## 2 2013-01-02 29.6 29.8 29.5 29.8 15041500 26.7
## 3 2013-01-03 29.8 29.9 29.5 29.6 9789700 26.5
## 4 2013-01-04 29.6 29.6 29.4 29.5 6832900 26.4
## 5 2013-01-07 29.4 29.5 29.3 29.5 7688000 26.4
## 6 2013-01-08 29.5 29.5 29.2 29.4 5565200 26.3
# Create a line chart of the 'close' prices
hchart(xlk_prices_tibble, hcaes(x = date, y = close), type = "line")
# Create a line chart of the open prices
hchart(xlk_prices_tibble, hcaes(x = date, y = open), type = "line")
# Inspect the first rows of the xts data object
head(stock_prices_xts)
## AMZN JPM DIS GOOG KO
## 2012-12-31 251 37.4 45.8 351 30.0
## 2013-01-02 257 38.3 47.0 359 31.1
## 2013-01-03 258 38.2 47.1 359 31.1
## 2013-01-04 259 38.9 48.0 367 31.2
## 2013-01-07 268 38.9 46.9 365 30.9
## 2013-01-08 266 39.0 46.7 364 30.6
# Extract and show the GOOG column from the xts object
head(stock_prices_xts$GOOG)
## GOOG
## 2012-12-31 351
## 2013-01-02 359
## 2013-01-03 359
## 2013-01-04 367
## 2013-01-07 365
## 2013-01-08 364
# Display the date index from the xts object
head(index(stock_prices_xts))
## [1] "2012-12-31" "2013-01-02" "2013-01-03" "2013-01-04" "2013-01-07"
## [6] "2013-01-08"
# Extract and show the DIS column from the xts object
head(stock_prices_xts$DIS)
## DIS
## 2012-12-31 45.8
## 2013-01-02 47.0
## 2013-01-03 47.1
## 2013-01-04 48.0
## 2013-01-07 46.9
## 2013-01-08 46.7
stock_wide_tibble_prices <- stock_tidy_tibble_prices %>%
tidyr::spread(symbol, price)
# Inspect the first rows of the wide tibble object
head(stock_wide_tibble_prices)
## # A tibble: 6 x 6
## date AMZN DIS GOOG JPM KO
## <date> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2012-12-31 251. 45.8 351. 37.4 30.0
## 2 2013-01-02 257. 47.0 359. 38.3 31.1
## 3 2013-01-03 258. 47.1 359. 38.2 31.1
## 4 2013-01-04 259. 48.0 367. 38.9 31.2
## 5 2013-01-07 268. 46.9 365. 38.9 30.9
## 6 2013-01-08 266. 46.7 364. 39.0 30.6
# Extract and show the GOOG column from the wide tibble data
head(stock_wide_tibble_prices$GOOG)
## [1] 351 359 359 367 365 364
# Display the date information from the wide tibble data
head(stock_wide_tibble_prices$date)
## [1] "2012-12-31" "2013-01-02" "2013-01-03" "2013-01-04" "2013-01-07"
## [6] "2013-01-08"
# Extract and show the DIS column from the wide tibble data
head(stock_wide_tibble_prices$DIS)
## [1] 45.8 47.0 47.1 48.0 46.9 46.7
# Inspect the first rows of the tidy tibble object
head(stock_tidy_tibble_prices)
## # A tibble: 6 x 3
## date symbol price
## <date> <chr> <dbl>
## 1 2012-12-31 AMZN 251.
## 2 2013-01-02 AMZN 257.
## 3 2013-01-03 AMZN 258.
## 4 2013-01-04 AMZN 259.
## 5 2013-01-07 AMZN 268.
## 6 2013-01-08 AMZN 266.
# Extract and show the GOOG price data from the tidy tibble data
stock_tidy_tibble_prices %>%
filter(symbol == "GOOG") %>%
head()
## # A tibble: 6 x 3
## date symbol price
## <date> <chr> <dbl>
## 1 2012-12-31 GOOG 351.
## 2 2013-01-02 GOOG 359.
## 3 2013-01-03 GOOG 359.
## 4 2013-01-04 GOOG 367.
## 5 2013-01-07 GOOG 365.
## 6 2013-01-08 GOOG 364.
# Display the date information from the tidy tibble
head(stock_tidy_tibble_prices$date)
## [1] "2012-12-31" "2013-01-02" "2013-01-03" "2013-01-04" "2013-01-07"
## [6] "2013-01-08"
# Extract and show the DIS price data from the tidy tibble data
stock_tidy_tibble_prices %>%
filter(symbol == "DIS") %>%
head()
## # A tibble: 6 x 3
## date symbol price
## <date> <chr> <dbl>
## 1 2012-12-31 DIS 45.8
## 2 2013-01-02 DIS 47.0
## 3 2013-01-03 DIS 47.1
## 4 2013-01-04 DIS 48.0
## 5 2013-01-07 DIS 46.9
## 6 2013-01-08 DIS 46.7
Chapter 2 - Highcharter for xts data
Chart the price of one stock in an xts object:
Chart the price of many stocks from xts:
Adding a title, subtitle, and axis labels:
hc_title(text = "5 ETFs Price History") %>% hc_subtitle(text = "daily prices") %>% hc_add_series(etf_prices_xts$SPY, color = "blue", name = "SPY") %>% hc_add_series(etf_prices_xts$IJS, color = "red", name = "IJS") %>% hc_add_series(etf_prices_xts$EEM, color = "green", name = "EEM") %>% hc_add_series(etf_prices_xts$EFA, color = "purple", name = "EFA") %>%
Tooltips and legends:
Example code includes:
# Chart the price of KO
highchart(type = "stock") %>%
hc_add_series(stock_prices_xts$KO)
# Fill in the complete highchart code flow to chart GOOG in green
highchart(type = "stock") %>%
hc_add_series(stock_prices_xts$GOOG, color = "green")
# Fill in the complete highchart code flow to chart DIS in purple
highchart(type = "stock") %>%
hc_add_series(stock_prices_xts$DIS, color = "purple")
highchart(type = "stock") %>%
# Add the price of GOOG, colored orange
hc_add_series(stock_prices_xts$GOOG, color = "orange") %>%
# Add the price of DIS, colored black
hc_add_series(stock_prices_xts$DIS, color = "black")
highchart(type = "stock") %>%
# Add the price of KO, colored green
hc_add_series(stock_prices_xts$KO, color = "green") %>%
# Add the price of JPM, colored pink
hc_add_series(stock_prices_xts$JPM, color = "pink")
highchart(type = "stock") %>%
# Add JPM as a blue line called JP Morgan
hc_add_series(stock_prices_xts$JPM, color = "blue", name = "JP Morgan") %>%
# Add KO as a red line called Coke
hc_add_series(stock_prices_xts$KO, color = "red", name = "Coke") %>%
# Add GOOG as a green line named Google
hc_add_series(stock_prices_xts$GOOG, color = "green", name = "Google") %>%
# Add DIS as a purple line named Disney
hc_add_series(stock_prices_xts$DIS, color = "purple", name = "Disney")
highchart(type = "stock") %>%
# Add the stocks to the chart with the correct color and name
hc_add_series(stock_prices_xts$JPM, color = "blue", name = "jpm") %>%
hc_add_series(stock_prices_xts$KO, color = "red", name = "coke") %>%
hc_add_series(stock_prices_xts$GOOG, color = "green", name = "google") %>%
hc_add_series(stock_prices_xts$DIS, color = "purple", name = "disney") %>%
hc_add_series(stock_prices_xts$AMZN, color = "black", name = "amazon")
highchart(type = "stock") %>%
# Supply the text of the title to hc_title()
hc_title(text = "A history of two stocks") %>%
# Supply the text of the subtitle to hc_subtitle()
hc_subtitle(text = "told with lines") %>%
hc_add_series(stock_prices_xts$AMZN, color = "blue", name = "AMZN") %>%
hc_add_series(stock_prices_xts$DIS, color = "red", name = "DIS") %>%
# Supply the text and format of the y-axis
hc_yAxis(title = list(text = "Prices (USD)"), labels = list(format = "${value}"), opposite = FALSE)
highchart(type = "stock") %>%
# Add a title
hc_title(text = "A history of two stocks") %>%
# Add a subtitle
hc_subtitle(text = "told with lines") %>%
hc_add_series(stock_prices_xts$AMZN, color = "blue", name = "AMZN") %>%
hc_add_series(stock_prices_xts$DIS, color = "red", name = "DIS") %>%
# Change the y-axis title
hc_yAxis(title = list(text = "in $$$s"), labels = list(format = "{value} USD"), opposite = FALSE)
highchart(type = "stock") %>%
hc_add_series(stock_prices_xts$AMZN, color = "blue", name = "AMZN") %>%
hc_add_series(stock_prices_xts$DIS, color = "red", name = "DIS") %>%
# Add the dollar sign and y-values on a new line
hc_tooltip(pointFormat = "Daily Price:<br> ${point.y}")
highchart(type = "stock") %>%
hc_add_series(stock_prices_xts$AMZN, color = "blue", name = "AMZN") %>%
hc_add_series(stock_prices_xts$DIS, color = "red", name = "DIS") %>%
hc_add_series(stock_prices_xts$GOOG, color = "green", name = "GOOG") %>%
# Add stock names and round the price
hc_tooltip(pointFormat = "{point.series.name}: ${point.y: .2f}") %>%
# Enable the legend
hc_legend(enabled = TRUE)
# Choose the type of highchart
highchart(type = "stock") %>%
# Add gold, platinum and palladium
hc_add_series(commodities_xts$gold, color = "yellow", name= "Gold") %>%
hc_add_series(commodities_xts$platinum, color = "grey", name= "Platinum") %>%
hc_add_series(commodities_xts$palladium, color = "blue", name= "Palladium") %>%
# Customize the pointFormat of the tooltip
hc_tooltip(pointFormat = "{point.series.name}: ${point.y} ") %>%
hc_title(text = "Gold, Platinum and Palladium 2017") %>%
hc_yAxis(labels = list(format = "${value}"))
Chapter 3 - Highcharter for wide tibble data
Visualizing one stock from wide tibble data:
Visualizing multiple stocks from wide tibble data:
hc_add_series(etf_prices_wide_tibble, hcaes(x = date, y = EEM), type = "line")
Scatterplots from etf_wide_tibble:
Mixing chart types from wide tibble data:
hc_yAxis(title = list(text = "EEM Daily returns (%)"), labels = list(format = "{value}%"), opposite = FALSE) %>% hc_xAxis(title = list(text = "SPY Daily returns (%)"), labels = list(format = "{value}%")) %>% hc_tooltip(pointFormat = "{point.date} <br> EEM {point.y: .2f}% <br> SPY: {point.x: .2f}%")
Example code includes:
# Single-stock line charts from wide tibble data: hchart() takes the data,
# an hcaes() mapping of x/y columns, and the series name/color.
# Visualize DIS as a line chart
hchart(stock_wide_tibble_prices, hcaes(x = date, y = DIS),
type = "line",
# Specify the name
name = "DIS",
# Specify the color
color = "orange"
)
# Create a green line chart of KO
hchart(stock_wide_tibble_prices, hcaes(x = date, y = KO), type = "line", color = "green", name = "KO")
# Create a black line chart of JPM
hchart(stock_wide_tibble_prices, hcaes(x = date, y = JPM), type = "line", color = "black", name = "JPM")
# Multiple stocks on one chart: start with hchart(), then layer additional
# series with hc_add_series(); a shared tooltip shows all series at once.
# Create a line chart of KO
hchart(stock_wide_tibble_prices, hcaes(x = date, y = KO), name = "KO", type = "line") %>%
# Add JPM to the chart
hc_add_series(stock_wide_tibble_prices, hcaes(x = date, y = JPM), name = "JPM", type = "line") %>%
# Add DIS to the chart
hc_add_series(stock_wide_tibble_prices, hcaes(x = date, y = DIS), name = "DIS", type = "line") %>%
# Add AMZN to the chart
hc_add_series(stock_wide_tibble_prices, hcaes(x = date, y = AMZN), name = "AMZN", type = "line") %>%
# Enable a shared tooltip
hc_tooltip(shared = TRUE)
# Legend variant: showInLegend = TRUE on each series plus hc_legend().
hchart(stock_wide_tibble_prices, hcaes(x=date, y=KO), name="KO", type="line", showInLegend = TRUE) %>%
# Add JPM to the chart and show it in the legend
hc_add_series(stock_wide_tibble_prices, hcaes(x=date, y=JPM), name="JPM", type="line", showInLegend=TRUE) %>%
# Add DIS to the chart and show it in the legend
hc_add_series(stock_wide_tibble_prices, hcaes(x=date, y=DIS), name="DIS", type="line", showInLegend=TRUE) %>%
# Add a legend to the chart
hc_legend(enabled = TRUE)
# Shared tooltip with a custom per-series format string.
hchart(stock_wide_tibble_prices, hcaes(x = date, y = KO), name = "KO", type = "line") %>%
# Add JPM to the chart
hc_add_series(stock_wide_tibble_prices, hcaes(x=date, y=JPM), name = "JPM", type = "line") %>%
# Enable a shared tooltip
hc_tooltip(shared = TRUE, pointFormat = "{point.series.name}: ${point.y: .2f}<br>") %>%
# Change the text of the title of the y-axis
hc_yAxis(title = list(text = "prices (USD)"))
# Scatter plots of daily returns for pairs of stocks.
# Specify a green scatter plot
hchart(stock_wide_tibble_returns, hcaes(x = GOOG, y = JPM),
type = "scatter", color = "green", name = "GOOG v. JPM"
) %>%
# Make the tooltip display the x and y points and percentage sign
hc_tooltip(pointFormat = "GOOG: {point.x: .2f}% <br>JPM: {point.y: .2f}%")
# NOTE(review): the series name says "GOOG v. AMZN" but the mapping is
# x = KO, y = AMZN — presumably the name should be "KO v. AMZN"; confirm.
hchart(stock_wide_tibble_returns, hcaes(x = KO, y = AMZN), type = "scatter",
color = "pink", name = "GOOG v. AMZN"
) %>%
# Add a custom tooltip format
hc_tooltip(pointFormat = "{point.date} <br>AMZN: {point.y: .2f}% <br>KO: {point.x: .2f}%")
# Mixing chart types: a scatter layer plus a line layer (here a fixed-slope
# line, y = 1.15 * x) drawn from the same data.
# Create a scatter plot
hchart(stock_wide_tibble_returns, hcaes(x = KO, y = GOOG), type = "scatter") %>%
# Add the slope variable
hc_add_series(stock_wide_tibble_returns, hcaes(x = KO, y = (KO * 1.15)), type = "line") %>%
# Customize the tooltip to show the date, x-, and y-values
hc_tooltip(pointFormat = "{point.date} <br> GOOG {point.y: .2f}% <br> KO: {point.x: .2f}%")
# Per-series tooltip: the line layer gets its own tooltip list, while
# hc_tooltip() at the end styles the scatter layer.
hchart(stock_wide_tibble_returns, hcaes(x = AMZN, y = DIS), type = "scatter") %>%
hc_add_series(stock_wide_tibble_returns, hcaes(x = AMZN, y = (AMZN * .492)), type = "line",
# Add the tooltip argument
tooltip = list(
# Change the header of the line tooltip
headerFormat = "DIS/AMZN linear relationship<br>",
# Customize the y value display
pointFormat = "{point.y: .2f}%"
)
) %>%
# Customize the scatter tooltip
hc_tooltip(pointFormat = "{point.date} <br> DIS: {point.y: .2f}% <br> AMZN: {point.x: .2f}%")
# Mapping an extra column (date) in hcaes() makes it available to the
# tooltip as {point.date}.
# Start the hchart flow for the returns data
hchart(commodities_returns, type = "scatter",
hcaes(x = gold, y = palladium, date = date), color = "pink"
) %>%
# Customize the tooltip
hc_tooltip(pointFormat = "date: {point.date} <br>palladium: {point.y:.4f} <br>gold: {point.x:.4f} ") %>%
hc_title(text = "Palladium Versus Gold 2017")
Chapter 4 - Highcharter for tidy tibble data
Tidy data:
Chart many ETF from a tidy tibble:
Creativity with tidy data:
Tidy tooltips:
mutate(type = case_when(symbol == "EFA" ~ "international", symbol == "EEM" ~ "emerging", symbol == "AGG" ~ "bond", symbol == "IJS" ~ "small-cap", symbol == "SPY" ~ "market")) mutate(type = case_when(symbol == "EFA" ~ "international", symbol == "EEM" ~ "emerging", symbol == "AGG" ~ "bond", symbol == "IJS" ~ "small-cap", symbol == "SPY" ~ "market")) %>% hchart(., hcaes(x = date, y = price, group = symbol), type = "line") %>% hc_tooltip(pointFormat = " {point.symbol}: ${point.price: .2f} <br> fund type: {point.type}") summarize(mean = mean(returns), st_dev = sd(returns), max_return = max(returns), min_return = min(returns)) %>% hchart(., hcaes(x = symbol, y = mean, group = symbol), type = "column") %>% hc_tooltip(pointFormat = "sd: {point.st_dev: .4f}% <br> max: {point.max_return: .4f}% <br> min: {point.min_return: .4f}%") Wrap up:
Example code includes:
# Line chart of a single symbol (KO) from tidy (long) data.
stock_tidy_tibble_prices %>%
# Filter by the symbol
filter(symbol == "KO") %>%
# Pass the data, choose the mappings and create a line chart
hchart(., hcaes(x = date, y = price), type = "line", color = "red")
# Same pattern for GOOG, in purple.
stock_tidy_tibble_prices %>%
# Filter the data by symbol
filter(symbol == "GOOG") %>%
# Pass the data
hchart(., hcaes(x = date, y = price), type = "line", color = "purple")
# Chart AMZN as a black line
stock_tidy_tibble_prices %>%
filter(symbol == "AMZN") %>%
hchart(., hcaes(x = date, y = price), type = "line", color = "black")
# All symbols at once: group = symbol draws one line per stock.
stock_tidy_tibble_prices %>%
# Pass in the data
hchart(., hcaes(x = date, y = price, group = symbol), type = "line") %>%
# Title the chart
hc_title(text = "Daily Prices from Tidy Tibble") %>%
# Customize the y-axis and move the labels to the left
hc_yAxis(title = list(text = "Prices (USD)"), labels = list(format = "${value}"), opposite = FALSE)
stock_tidy_tibble_prices %>%
# Filter the data so it doesn't include JP Morgan
filter(symbol != "JPM") %>%
# Pass in the data and define the aesthetic mappings
hchart(., hcaes(x = date, y = price, group = symbol), type = "line")
stock_tidy_tibble_prices %>%
# Filter the data so it doesn't include Disney and Coke
filter(!(symbol %in% c("DIS", "KO"))) %>%
# Pass in the data and define the aesthetic mappings
hchart(., hcaes(x = date, y = price, group = symbol), type = "line")
# Daily simple returns per symbol: price / lag(price) - 1 within each group;
# the first observation of each symbol (whose return is NA) is dropped.
# Note the result stays grouped by symbol (see str() output below).
stock_tidy_tibble_returns <- stock_tidy_tibble_prices %>%
arrange(symbol, date) %>%
group_by(symbol) %>%
mutate(returns = price / lag(price) - 1) %>%
filter(!is.na(returns))
str(stock_tidy_tibble_returns)
## Classes 'grouped_df', 'tbl_df', 'tbl' and 'data.frame': 6295 obs. of 4 variables:
## $ date : Date, format: "2013-01-02" "2013-01-03" ...
## $ symbol : chr "AMZN" "AMZN" "AMZN" "AMZN" ...
## $ price : num 257 258 259 268 266 ...
## $ returns: num 0.02567 0.00455 0.00259 0.03593 -0.00775 ...
## - attr(*, "groups")=Classes 'tbl_df', 'tbl' and 'data.frame': 5 obs. of 2 variables:
## ..$ symbol: chr "AMZN" "DIS" "GOOG" "JPM" ...
## ..$ .rows :List of 5
## .. ..$ : int 1 2 3 4 5 6 7 8 9 10 ...
## .. ..$ : int 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 ...
## .. ..$ : int 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 ...
## .. ..$ : int 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 ...
## .. ..$ : int 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 ...
## ..- attr(*, ".drop")= logi TRUE
# Because the returns tibble is grouped by symbol, summarize() yields one
# row per stock; those summary stats are then charted directly.
stock_tidy_tibble_returns %>%
# Calculate the standard deviation and mean of returns
summarize(std_dev = sd(returns), mean = mean(returns)) %>%
hchart(., hcaes(x = symbol, y = std_dev, color = symbol, size = mean), type = "scatter") %>%
hc_title(text = "Standard Dev and Mean Return")
# Risk/return ratio per symbol as a column chart.
stock_tidy_tibble_returns %>%
summarize(avg_returns = mean(returns), vol_risk = sd(returns), risk_return = vol_risk/avg_returns) %>%
# Pass the summary statistics to hchart
hchart(., hcaes(x = symbol, y = risk_return, group = symbol), type = "column") %>%
hc_title(text = "Risk/Return") %>%
hc_subtitle(text = "lower bars are better")
# Tidy tooltips: any column created with mutate() (here, sector) can be
# referenced in the tooltip as {point.<column>}.
stock_tidy_tibble_prices %>%
mutate(sector = case_when(symbol == "AMZN" ~ "tech", symbol == "GOOG" ~ "tech", symbol == "DIS" ~ "fun",
symbol == "JPM" ~ "bank", symbol == "KO" ~ "food")) %>%
hchart(., hcaes(x = date, y = price, group = symbol), type = "line") %>%
# Set the tooltip display with curly braces
hc_tooltip(pointFormat = "{point.symbol}: ${point.y: .2f}<br> sector: {point.sector}")
# Calculate the mean, sd, max and min returns
stock_tidy_tibble_returns %>%
summarize(mean = mean(returns), st_dev = sd(returns),
max_return = max(returns), min_return = min(returns)
) %>%
hchart(., hcaes(x = symbol, y = st_dev, group = symbol), type = "column") %>%
hc_tooltip(pointFormat = "mean: {point.mean: .4f}% <br>max: {point.max_return: .4f}% <br>min: {point.min_return: .4f}%")
# Mapping date in hcaes() exposes it to the tooltip as {point.date}.
# Pass the tidy tibble to hchart()
hchart(commodities_returns_tidy, hcaes(x=date, y=return, group=metal, date=date), type="scatter") %>%
hc_title(text = "Gold, Palladium and Platinum Returns 2017") %>%
# Customize the tooltip
hc_tooltip(pointFormat = "date: {point.date} <br>{point.metal}: {point.return: .4f}")
Chapter 1 - Introduction to Advanced Dimensionality Reduction
Exploring the MNIST Dataset:
Distance Metrics:
PCA and t-SNE:
Example code includes:
# Load the pre-saved sample datasets used throughout this chapter.
load("./RInputFiles/mnist-sample-200.RData")
load("./RInputFiles/fashion_mnist_500.RData")
load("./RInputFiles/creditcard.RData")
# 200 rows x 785 columns (1 label + 784 pixel values)
dim(mnist_sample)
## [1] 200 785
# 500 rows x 785 columns (1 label + 784 pixel values)
dim(fashion_mnist)
## [1] 500 785
# Credit-card fraud data: Time, 28 anonymized features V1-V28, Amount, Class
str(creditcard)
## Classes 'data.table' and 'data.frame': 28923 obs. of 31 variables:
## $ Time : num 406 472 4462 6986 7519 ...
## $ V1 : num -2.31 -3.04 -2.3 -4.4 1.23 ...
## $ V2 : num 1.95 -3.16 1.76 1.36 3.02 ...
## $ V3 : num -1.61 1.09 -0.36 -2.59 -4.3 ...
## $ V4 : num 4 2.29 2.33 2.68 4.73 ...
## $ V5 : num -0.522 1.36 -0.822 -1.128 3.624 ...
## $ V6 : num -1.4265 -1.0648 -0.0758 -1.7065 -1.3577 ...
## $ V7 : num -2.537 0.326 0.562 -3.496 1.713 ...
## $ V8 : num 1.3917 -0.0678 -0.3991 -0.2488 -0.4964 ...
## $ V9 : num -2.77 -0.271 -0.238 -0.248 -1.283 ...
## $ V10 : num -2.772 -0.839 -1.525 -4.802 -2.447 ...
## $ V11 : num 3.202 -0.415 2.033 4.896 2.101 ...
## $ V12 : num -2.9 -0.503 -6.56 -10.913 -4.61 ...
## $ V13 : num -0.5952 0.6765 0.0229 0.1844 1.4644 ...
## $ V14 : num -4.29 -1.69 -1.47 -6.77 -6.08 ...
## $ V15 : num 0.38972 2.00063 -0.69883 -0.00733 -0.33924 ...
## $ V16 : num -1.141 0.667 -2.282 -7.358 2.582 ...
## $ V17 : num -2.83 0.6 -4.78 -12.6 6.74 ...
## $ V18 : num -0.0168 1.7253 -2.6157 -5.1315 3.0425 ...
## $ V19 : num 0.417 0.283 -1.334 0.308 -2.722 ...
## $ V20 : num 0.12691 2.10234 -0.43002 -0.17161 0.00906 ...
## $ V21 : num 0.517 0.662 -0.294 0.574 -0.379 ...
## $ V22 : num -0.035 0.435 -0.932 0.177 -0.704 ...
## $ V23 : num -0.465 1.376 0.173 -0.436 -0.657 ...
## $ V24 : num 0.3202 -0.2938 -0.0873 -0.0535 -1.6327 ...
## $ V25 : num 0.0445 0.2798 -0.1561 0.2524 1.4889 ...
## $ V26 : num 0.178 -0.145 -0.543 -0.657 0.567 ...
## $ V27 : num 0.2611 -0.2528 0.0396 -0.8271 -0.01 ...
## $ V28 : num -0.1433 0.0358 -0.153 0.8496 0.1468 ...
## $ Amount: num 0 529 240 59 1 ...
## $ Class : chr "1" "1" "1" "1" ...
## - attr(*, ".internal.selfref")=<externalptr>
# Have a look at the MNIST dataset names: "label" plus pixel0..pixel783
names(mnist_sample)
## [1] "label" "pixel0" "pixel1" "pixel2" "pixel3" "pixel4"
## [7] "pixel5" "pixel6" "pixel7" "pixel8" "pixel9" "pixel10"
## [13] "pixel11" "pixel12" "pixel13" "pixel14" "pixel15" "pixel16"
## [19] "pixel17" "pixel18" "pixel19" "pixel20" "pixel21" "pixel22"
## [25] "pixel23" "pixel24" "pixel25" "pixel26" "pixel27" "pixel28"
## [31] "pixel29" "pixel30" "pixel31" "pixel32" "pixel33" "pixel34"
## [37] "pixel35" "pixel36" "pixel37" "pixel38" "pixel39" "pixel40"
## [43] "pixel41" "pixel42" "pixel43" "pixel44" "pixel45" "pixel46"
## [49] "pixel47" "pixel48" "pixel49" "pixel50" "pixel51" "pixel52"
## [55] "pixel53" "pixel54" "pixel55" "pixel56" "pixel57" "pixel58"
## [61] "pixel59" "pixel60" "pixel61" "pixel62" "pixel63" "pixel64"
## [67] "pixel65" "pixel66" "pixel67" "pixel68" "pixel69" "pixel70"
## [73] "pixel71" "pixel72" "pixel73" "pixel74" "pixel75" "pixel76"
## [79] "pixel77" "pixel78" "pixel79" "pixel80" "pixel81" "pixel82"
## [85] "pixel83" "pixel84" "pixel85" "pixel86" "pixel87" "pixel88"
## [91] "pixel89" "pixel90" "pixel91" "pixel92" "pixel93" "pixel94"
## [97] "pixel95" "pixel96" "pixel97" "pixel98" "pixel99" "pixel100"
## [103] "pixel101" "pixel102" "pixel103" "pixel104" "pixel105" "pixel106"
## [109] "pixel107" "pixel108" "pixel109" "pixel110" "pixel111" "pixel112"
## [115] "pixel113" "pixel114" "pixel115" "pixel116" "pixel117" "pixel118"
## [121] "pixel119" "pixel120" "pixel121" "pixel122" "pixel123" "pixel124"
## [127] "pixel125" "pixel126" "pixel127" "pixel128" "pixel129" "pixel130"
## [133] "pixel131" "pixel132" "pixel133" "pixel134" "pixel135" "pixel136"
## [139] "pixel137" "pixel138" "pixel139" "pixel140" "pixel141" "pixel142"
## [145] "pixel143" "pixel144" "pixel145" "pixel146" "pixel147" "pixel148"
## [151] "pixel149" "pixel150" "pixel151" "pixel152" "pixel153" "pixel154"
## [157] "pixel155" "pixel156" "pixel157" "pixel158" "pixel159" "pixel160"
## [163] "pixel161" "pixel162" "pixel163" "pixel164" "pixel165" "pixel166"
## [169] "pixel167" "pixel168" "pixel169" "pixel170" "pixel171" "pixel172"
## [175] "pixel173" "pixel174" "pixel175" "pixel176" "pixel177" "pixel178"
## [181] "pixel179" "pixel180" "pixel181" "pixel182" "pixel183" "pixel184"
## [187] "pixel185" "pixel186" "pixel187" "pixel188" "pixel189" "pixel190"
## [193] "pixel191" "pixel192" "pixel193" "pixel194" "pixel195" "pixel196"
## [199] "pixel197" "pixel198" "pixel199" "pixel200" "pixel201" "pixel202"
## [205] "pixel203" "pixel204" "pixel205" "pixel206" "pixel207" "pixel208"
## [211] "pixel209" "pixel210" "pixel211" "pixel212" "pixel213" "pixel214"
## [217] "pixel215" "pixel216" "pixel217" "pixel218" "pixel219" "pixel220"
## [223] "pixel221" "pixel222" "pixel223" "pixel224" "pixel225" "pixel226"
## [229] "pixel227" "pixel228" "pixel229" "pixel230" "pixel231" "pixel232"
## [235] "pixel233" "pixel234" "pixel235" "pixel236" "pixel237" "pixel238"
## [241] "pixel239" "pixel240" "pixel241" "pixel242" "pixel243" "pixel244"
## [247] "pixel245" "pixel246" "pixel247" "pixel248" "pixel249" "pixel250"
## [253] "pixel251" "pixel252" "pixel253" "pixel254" "pixel255" "pixel256"
## [259] "pixel257" "pixel258" "pixel259" "pixel260" "pixel261" "pixel262"
## [265] "pixel263" "pixel264" "pixel265" "pixel266" "pixel267" "pixel268"
## [271] "pixel269" "pixel270" "pixel271" "pixel272" "pixel273" "pixel274"
## [277] "pixel275" "pixel276" "pixel277" "pixel278" "pixel279" "pixel280"
## [283] "pixel281" "pixel282" "pixel283" "pixel284" "pixel285" "pixel286"
## [289] "pixel287" "pixel288" "pixel289" "pixel290" "pixel291" "pixel292"
## [295] "pixel293" "pixel294" "pixel295" "pixel296" "pixel297" "pixel298"
## [301] "pixel299" "pixel300" "pixel301" "pixel302" "pixel303" "pixel304"
## [307] "pixel305" "pixel306" "pixel307" "pixel308" "pixel309" "pixel310"
## [313] "pixel311" "pixel312" "pixel313" "pixel314" "pixel315" "pixel316"
## [319] "pixel317" "pixel318" "pixel319" "pixel320" "pixel321" "pixel322"
## [325] "pixel323" "pixel324" "pixel325" "pixel326" "pixel327" "pixel328"
## [331] "pixel329" "pixel330" "pixel331" "pixel332" "pixel333" "pixel334"
## [337] "pixel335" "pixel336" "pixel337" "pixel338" "pixel339" "pixel340"
## [343] "pixel341" "pixel342" "pixel343" "pixel344" "pixel345" "pixel346"
## [349] "pixel347" "pixel348" "pixel349" "pixel350" "pixel351" "pixel352"
## [355] "pixel353" "pixel354" "pixel355" "pixel356" "pixel357" "pixel358"
## [361] "pixel359" "pixel360" "pixel361" "pixel362" "pixel363" "pixel364"
## [367] "pixel365" "pixel366" "pixel367" "pixel368" "pixel369" "pixel370"
## [373] "pixel371" "pixel372" "pixel373" "pixel374" "pixel375" "pixel376"
## [379] "pixel377" "pixel378" "pixel379" "pixel380" "pixel381" "pixel382"
## [385] "pixel383" "pixel384" "pixel385" "pixel386" "pixel387" "pixel388"
## [391] "pixel389" "pixel390" "pixel391" "pixel392" "pixel393" "pixel394"
## [397] "pixel395" "pixel396" "pixel397" "pixel398" "pixel399" "pixel400"
## [403] "pixel401" "pixel402" "pixel403" "pixel404" "pixel405" "pixel406"
## [409] "pixel407" "pixel408" "pixel409" "pixel410" "pixel411" "pixel412"
## [415] "pixel413" "pixel414" "pixel415" "pixel416" "pixel417" "pixel418"
## [421] "pixel419" "pixel420" "pixel421" "pixel422" "pixel423" "pixel424"
## [427] "pixel425" "pixel426" "pixel427" "pixel428" "pixel429" "pixel430"
## [433] "pixel431" "pixel432" "pixel433" "pixel434" "pixel435" "pixel436"
## [439] "pixel437" "pixel438" "pixel439" "pixel440" "pixel441" "pixel442"
## [445] "pixel443" "pixel444" "pixel445" "pixel446" "pixel447" "pixel448"
## [451] "pixel449" "pixel450" "pixel451" "pixel452" "pixel453" "pixel454"
## [457] "pixel455" "pixel456" "pixel457" "pixel458" "pixel459" "pixel460"
## [463] "pixel461" "pixel462" "pixel463" "pixel464" "pixel465" "pixel466"
## [469] "pixel467" "pixel468" "pixel469" "pixel470" "pixel471" "pixel472"
## [475] "pixel473" "pixel474" "pixel475" "pixel476" "pixel477" "pixel478"
## [481] "pixel479" "pixel480" "pixel481" "pixel482" "pixel483" "pixel484"
## [487] "pixel485" "pixel486" "pixel487" "pixel488" "pixel489" "pixel490"
## [493] "pixel491" "pixel492" "pixel493" "pixel494" "pixel495" "pixel496"
## [499] "pixel497" "pixel498" "pixel499" "pixel500" "pixel501" "pixel502"
## [505] "pixel503" "pixel504" "pixel505" "pixel506" "pixel507" "pixel508"
## [511] "pixel509" "pixel510" "pixel511" "pixel512" "pixel513" "pixel514"
## [517] "pixel515" "pixel516" "pixel517" "pixel518" "pixel519" "pixel520"
## [523] "pixel521" "pixel522" "pixel523" "pixel524" "pixel525" "pixel526"
## [529] "pixel527" "pixel528" "pixel529" "pixel530" "pixel531" "pixel532"
## [535] "pixel533" "pixel534" "pixel535" "pixel536" "pixel537" "pixel538"
## [541] "pixel539" "pixel540" "pixel541" "pixel542" "pixel543" "pixel544"
## [547] "pixel545" "pixel546" "pixel547" "pixel548" "pixel549" "pixel550"
## [553] "pixel551" "pixel552" "pixel553" "pixel554" "pixel555" "pixel556"
## [559] "pixel557" "pixel558" "pixel559" "pixel560" "pixel561" "pixel562"
## [565] "pixel563" "pixel564" "pixel565" "pixel566" "pixel567" "pixel568"
## [571] "pixel569" "pixel570" "pixel571" "pixel572" "pixel573" "pixel574"
## [577] "pixel575" "pixel576" "pixel577" "pixel578" "pixel579" "pixel580"
## [583] "pixel581" "pixel582" "pixel583" "pixel584" "pixel585" "pixel586"
## [589] "pixel587" "pixel588" "pixel589" "pixel590" "pixel591" "pixel592"
## [595] "pixel593" "pixel594" "pixel595" "pixel596" "pixel597" "pixel598"
## [601] "pixel599" "pixel600" "pixel601" "pixel602" "pixel603" "pixel604"
## [607] "pixel605" "pixel606" "pixel607" "pixel608" "pixel609" "pixel610"
## [613] "pixel611" "pixel612" "pixel613" "pixel614" "pixel615" "pixel616"
## [619] "pixel617" "pixel618" "pixel619" "pixel620" "pixel621" "pixel622"
## [625] "pixel623" "pixel624" "pixel625" "pixel626" "pixel627" "pixel628"
## [631] "pixel629" "pixel630" "pixel631" "pixel632" "pixel633" "pixel634"
## [637] "pixel635" "pixel636" "pixel637" "pixel638" "pixel639" "pixel640"
## [643] "pixel641" "pixel642" "pixel643" "pixel644" "pixel645" "pixel646"
## [649] "pixel647" "pixel648" "pixel649" "pixel650" "pixel651" "pixel652"
## [655] "pixel653" "pixel654" "pixel655" "pixel656" "pixel657" "pixel658"
## [661] "pixel659" "pixel660" "pixel661" "pixel662" "pixel663" "pixel664"
## [667] "pixel665" "pixel666" "pixel667" "pixel668" "pixel669" "pixel670"
## [673] "pixel671" "pixel672" "pixel673" "pixel674" "pixel675" "pixel676"
## [679] "pixel677" "pixel678" "pixel679" "pixel680" "pixel681" "pixel682"
## [685] "pixel683" "pixel684" "pixel685" "pixel686" "pixel687" "pixel688"
## [691] "pixel689" "pixel690" "pixel691" "pixel692" "pixel693" "pixel694"
## [697] "pixel695" "pixel696" "pixel697" "pixel698" "pixel699" "pixel700"
## [703] "pixel701" "pixel702" "pixel703" "pixel704" "pixel705" "pixel706"
## [709] "pixel707" "pixel708" "pixel709" "pixel710" "pixel711" "pixel712"
## [715] "pixel713" "pixel714" "pixel715" "pixel716" "pixel717" "pixel718"
## [721] "pixel719" "pixel720" "pixel721" "pixel722" "pixel723" "pixel724"
## [727] "pixel725" "pixel726" "pixel727" "pixel728" "pixel729" "pixel730"
## [733] "pixel731" "pixel732" "pixel733" "pixel734" "pixel735" "pixel736"
## [739] "pixel737" "pixel738" "pixel739" "pixel740" "pixel741" "pixel742"
## [745] "pixel743" "pixel744" "pixel745" "pixel746" "pixel747" "pixel748"
## [751] "pixel749" "pixel750" "pixel751" "pixel752" "pixel753" "pixel754"
## [757] "pixel755" "pixel756" "pixel757" "pixel758" "pixel759" "pixel760"
## [763] "pixel761" "pixel762" "pixel763" "pixel764" "pixel765" "pixel766"
## [769] "pixel767" "pixel768" "pixel769" "pixel770" "pixel771" "pixel772"
## [775] "pixel773" "pixel774" "pixel775" "pixel776" "pixel777" "pixel778"
## [781] "pixel779" "pixel780" "pixel781" "pixel782" "pixel783"
# Fashion MNIST columns: "label" plus pixel1..pixel784
# (pixel numbering is 1-based here, unlike MNIST's pixel0..pixel783)
names(fashion_mnist)
## [1] "label" "pixel1" "pixel2" "pixel3" "pixel4" "pixel5"
## [7] "pixel6" "pixel7" "pixel8" "pixel9" "pixel10" "pixel11"
## [13] "pixel12" "pixel13" "pixel14" "pixel15" "pixel16" "pixel17"
## [19] "pixel18" "pixel19" "pixel20" "pixel21" "pixel22" "pixel23"
## [25] "pixel24" "pixel25" "pixel26" "pixel27" "pixel28" "pixel29"
## [31] "pixel30" "pixel31" "pixel32" "pixel33" "pixel34" "pixel35"
## [37] "pixel36" "pixel37" "pixel38" "pixel39" "pixel40" "pixel41"
## [43] "pixel42" "pixel43" "pixel44" "pixel45" "pixel46" "pixel47"
## [49] "pixel48" "pixel49" "pixel50" "pixel51" "pixel52" "pixel53"
## [55] "pixel54" "pixel55" "pixel56" "pixel57" "pixel58" "pixel59"
## [61] "pixel60" "pixel61" "pixel62" "pixel63" "pixel64" "pixel65"
## [67] "pixel66" "pixel67" "pixel68" "pixel69" "pixel70" "pixel71"
## [73] "pixel72" "pixel73" "pixel74" "pixel75" "pixel76" "pixel77"
## [79] "pixel78" "pixel79" "pixel80" "pixel81" "pixel82" "pixel83"
## [85] "pixel84" "pixel85" "pixel86" "pixel87" "pixel88" "pixel89"
## [91] "pixel90" "pixel91" "pixel92" "pixel93" "pixel94" "pixel95"
## [97] "pixel96" "pixel97" "pixel98" "pixel99" "pixel100" "pixel101"
## [103] "pixel102" "pixel103" "pixel104" "pixel105" "pixel106" "pixel107"
## [109] "pixel108" "pixel109" "pixel110" "pixel111" "pixel112" "pixel113"
## [115] "pixel114" "pixel115" "pixel116" "pixel117" "pixel118" "pixel119"
## [121] "pixel120" "pixel121" "pixel122" "pixel123" "pixel124" "pixel125"
## [127] "pixel126" "pixel127" "pixel128" "pixel129" "pixel130" "pixel131"
## [133] "pixel132" "pixel133" "pixel134" "pixel135" "pixel136" "pixel137"
## [139] "pixel138" "pixel139" "pixel140" "pixel141" "pixel142" "pixel143"
## [145] "pixel144" "pixel145" "pixel146" "pixel147" "pixel148" "pixel149"
## [151] "pixel150" "pixel151" "pixel152" "pixel153" "pixel154" "pixel155"
## [157] "pixel156" "pixel157" "pixel158" "pixel159" "pixel160" "pixel161"
## [163] "pixel162" "pixel163" "pixel164" "pixel165" "pixel166" "pixel167"
## [169] "pixel168" "pixel169" "pixel170" "pixel171" "pixel172" "pixel173"
## [175] "pixel174" "pixel175" "pixel176" "pixel177" "pixel178" "pixel179"
## [181] "pixel180" "pixel181" "pixel182" "pixel183" "pixel184" "pixel185"
## [187] "pixel186" "pixel187" "pixel188" "pixel189" "pixel190" "pixel191"
## [193] "pixel192" "pixel193" "pixel194" "pixel195" "pixel196" "pixel197"
## [199] "pixel198" "pixel199" "pixel200" "pixel201" "pixel202" "pixel203"
## [205] "pixel204" "pixel205" "pixel206" "pixel207" "pixel208" "pixel209"
## [211] "pixel210" "pixel211" "pixel212" "pixel213" "pixel214" "pixel215"
## [217] "pixel216" "pixel217" "pixel218" "pixel219" "pixel220" "pixel221"
## [223] "pixel222" "pixel223" "pixel224" "pixel225" "pixel226" "pixel227"
## [229] "pixel228" "pixel229" "pixel230" "pixel231" "pixel232" "pixel233"
## [235] "pixel234" "pixel235" "pixel236" "pixel237" "pixel238" "pixel239"
## [241] "pixel240" "pixel241" "pixel242" "pixel243" "pixel244" "pixel245"
## [247] "pixel246" "pixel247" "pixel248" "pixel249" "pixel250" "pixel251"
## [253] "pixel252" "pixel253" "pixel254" "pixel255" "pixel256" "pixel257"
## [259] "pixel258" "pixel259" "pixel260" "pixel261" "pixel262" "pixel263"
## [265] "pixel264" "pixel265" "pixel266" "pixel267" "pixel268" "pixel269"
## [271] "pixel270" "pixel271" "pixel272" "pixel273" "pixel274" "pixel275"
## [277] "pixel276" "pixel277" "pixel278" "pixel279" "pixel280" "pixel281"
## [283] "pixel282" "pixel283" "pixel284" "pixel285" "pixel286" "pixel287"
## [289] "pixel288" "pixel289" "pixel290" "pixel291" "pixel292" "pixel293"
## [295] "pixel294" "pixel295" "pixel296" "pixel297" "pixel298" "pixel299"
## [301] "pixel300" "pixel301" "pixel302" "pixel303" "pixel304" "pixel305"
## [307] "pixel306" "pixel307" "pixel308" "pixel309" "pixel310" "pixel311"
## [313] "pixel312" "pixel313" "pixel314" "pixel315" "pixel316" "pixel317"
## [319] "pixel318" "pixel319" "pixel320" "pixel321" "pixel322" "pixel323"
## [325] "pixel324" "pixel325" "pixel326" "pixel327" "pixel328" "pixel329"
## [331] "pixel330" "pixel331" "pixel332" "pixel333" "pixel334" "pixel335"
## [337] "pixel336" "pixel337" "pixel338" "pixel339" "pixel340" "pixel341"
## [343] "pixel342" "pixel343" "pixel344" "pixel345" "pixel346" "pixel347"
## [349] "pixel348" "pixel349" "pixel350" "pixel351" "pixel352" "pixel353"
## [355] "pixel354" "pixel355" "pixel356" "pixel357" "pixel358" "pixel359"
## [361] "pixel360" "pixel361" "pixel362" "pixel363" "pixel364" "pixel365"
## [367] "pixel366" "pixel367" "pixel368" "pixel369" "pixel370" "pixel371"
## [373] "pixel372" "pixel373" "pixel374" "pixel375" "pixel376" "pixel377"
## [379] "pixel378" "pixel379" "pixel380" "pixel381" "pixel382" "pixel383"
## [385] "pixel384" "pixel385" "pixel386" "pixel387" "pixel388" "pixel389"
## [391] "pixel390" "pixel391" "pixel392" "pixel393" "pixel394" "pixel395"
## [397] "pixel396" "pixel397" "pixel398" "pixel399" "pixel400" "pixel401"
## [403] "pixel402" "pixel403" "pixel404" "pixel405" "pixel406" "pixel407"
## [409] "pixel408" "pixel409" "pixel410" "pixel411" "pixel412" "pixel413"
## [415] "pixel414" "pixel415" "pixel416" "pixel417" "pixel418" "pixel419"
## [421] "pixel420" "pixel421" "pixel422" "pixel423" "pixel424" "pixel425"
## [427] "pixel426" "pixel427" "pixel428" "pixel429" "pixel430" "pixel431"
## [433] "pixel432" "pixel433" "pixel434" "pixel435" "pixel436" "pixel437"
## [439] "pixel438" "pixel439" "pixel440" "pixel441" "pixel442" "pixel443"
## [445] "pixel444" "pixel445" "pixel446" "pixel447" "pixel448" "pixel449"
## [451] "pixel450" "pixel451" "pixel452" "pixel453" "pixel454" "pixel455"
## [457] "pixel456" "pixel457" "pixel458" "pixel459" "pixel460" "pixel461"
## [463] "pixel462" "pixel463" "pixel464" "pixel465" "pixel466" "pixel467"
## [469] "pixel468" "pixel469" "pixel470" "pixel471" "pixel472" "pixel473"
## [475] "pixel474" "pixel475" "pixel476" "pixel477" "pixel478" "pixel479"
## [481] "pixel480" "pixel481" "pixel482" "pixel483" "pixel484" "pixel485"
## [487] "pixel486" "pixel487" "pixel488" "pixel489" "pixel490" "pixel491"
## [493] "pixel492" "pixel493" "pixel494" "pixel495" "pixel496" "pixel497"
## [499] "pixel498" "pixel499" "pixel500" "pixel501" "pixel502" "pixel503"
## [505] "pixel504" "pixel505" "pixel506" "pixel507" "pixel508" "pixel509"
## [511] "pixel510" "pixel511" "pixel512" "pixel513" "pixel514" "pixel515"
## [517] "pixel516" "pixel517" "pixel518" "pixel519" "pixel520" "pixel521"
## [523] "pixel522" "pixel523" "pixel524" "pixel525" "pixel526" "pixel527"
## [529] "pixel528" "pixel529" "pixel530" "pixel531" "pixel532" "pixel533"
## [535] "pixel534" "pixel535" "pixel536" "pixel537" "pixel538" "pixel539"
## [541] "pixel540" "pixel541" "pixel542" "pixel543" "pixel544" "pixel545"
## [547] "pixel546" "pixel547" "pixel548" "pixel549" "pixel550" "pixel551"
## [553] "pixel552" "pixel553" "pixel554" "pixel555" "pixel556" "pixel557"
## [559] "pixel558" "pixel559" "pixel560" "pixel561" "pixel562" "pixel563"
## [565] "pixel564" "pixel565" "pixel566" "pixel567" "pixel568" "pixel569"
## [571] "pixel570" "pixel571" "pixel572" "pixel573" "pixel574" "pixel575"
## [577] "pixel576" "pixel577" "pixel578" "pixel579" "pixel580" "pixel581"
## [583] "pixel582" "pixel583" "pixel584" "pixel585" "pixel586" "pixel587"
## [589] "pixel588" "pixel589" "pixel590" "pixel591" "pixel592" "pixel593"
## [595] "pixel594" "pixel595" "pixel596" "pixel597" "pixel598" "pixel599"
## [601] "pixel600" "pixel601" "pixel602" "pixel603" "pixel604" "pixel605"
## [607] "pixel606" "pixel607" "pixel608" "pixel609" "pixel610" "pixel611"
## [613] "pixel612" "pixel613" "pixel614" "pixel615" "pixel616" "pixel617"
## [619] "pixel618" "pixel619" "pixel620" "pixel621" "pixel622" "pixel623"
## [625] "pixel624" "pixel625" "pixel626" "pixel627" "pixel628" "pixel629"
## [631] "pixel630" "pixel631" "pixel632" "pixel633" "pixel634" "pixel635"
## [637] "pixel636" "pixel637" "pixel638" "pixel639" "pixel640" "pixel641"
## [643] "pixel642" "pixel643" "pixel644" "pixel645" "pixel646" "pixel647"
## [649] "pixel648" "pixel649" "pixel650" "pixel651" "pixel652" "pixel653"
## [655] "pixel654" "pixel655" "pixel656" "pixel657" "pixel658" "pixel659"
## [661] "pixel660" "pixel661" "pixel662" "pixel663" "pixel664" "pixel665"
## [667] "pixel666" "pixel667" "pixel668" "pixel669" "pixel670" "pixel671"
## [673] "pixel672" "pixel673" "pixel674" "pixel675" "pixel676" "pixel677"
## [679] "pixel678" "pixel679" "pixel680" "pixel681" "pixel682" "pixel683"
## [685] "pixel684" "pixel685" "pixel686" "pixel687" "pixel688" "pixel689"
## [691] "pixel690" "pixel691" "pixel692" "pixel693" "pixel694" "pixel695"
## [697] "pixel696" "pixel697" "pixel698" "pixel699" "pixel700" "pixel701"
## [703] "pixel702" "pixel703" "pixel704" "pixel705" "pixel706" "pixel707"
## [709] "pixel708" "pixel709" "pixel710" "pixel711" "pixel712" "pixel713"
## [715] "pixel714" "pixel715" "pixel716" "pixel717" "pixel718" "pixel719"
## [721] "pixel720" "pixel721" "pixel722" "pixel723" "pixel724" "pixel725"
## [727] "pixel726" "pixel727" "pixel728" "pixel729" "pixel730" "pixel731"
## [733] "pixel732" "pixel733" "pixel734" "pixel735" "pixel736" "pixel737"
## [739] "pixel738" "pixel739" "pixel740" "pixel741" "pixel742" "pixel743"
## [745] "pixel744" "pixel745" "pixel746" "pixel747" "pixel748" "pixel749"
## [751] "pixel750" "pixel751" "pixel752" "pixel753" "pixel754" "pixel755"
## [757] "pixel756" "pixel757" "pixel758" "pixel759" "pixel760" "pixel761"
## [763] "pixel762" "pixel763" "pixel764" "pixel765" "pixel766" "pixel767"
## [769] "pixel768" "pixel769" "pixel770" "pixel771" "pixel772" "pixel773"
## [775] "pixel774" "pixel775" "pixel776" "pixel777" "pixel778" "pixel779"
## [781] "pixel780" "pixel781" "pixel782" "pixel783" "pixel784"
# Labels of the first 6 digits
head(mnist_sample[, 1])
## [1] 5 0 7 0 9 3
# Plot the histogram of the digit labels
hist(mnist_sample$label)
# Compute the basic statistics of all records
# summary(mnist_sample)
# Compute the basic statistics of digits with label 0
# summary(mnist_sample[mnist_sample$label==0,])
# Show the labels of the first 10 records
mnist_sample$label[1:10]
## [1] 5 0 7 0 9 3 4 1 2 6
# Compute the Euclidean distance of the first 10 records
# (column 1 is the label, so it is excluded with -1)
distances <- dist(mnist_sample[1:10, -1], method="euclidean")
# Show the distances values
distances
## 1 2 3 4 5 6 7 8 9
## 2 2186
## 3 2656 2870
## 4 2547 2341 2937
## 5 2407 2959 1976 2871
## 6 2344 2760 2453 2739 2126
## 7 2464 2784 2574 2871 2174 2654
## 8 2150 2669 2000 2586 2067 2273 2408
## 9 2959 3210 2935 3414 2871 3115 2981 2833
## 10 2729 3010 2575 2833 2396 2656 2464 2550 2695
# Plot the numeric matrix of the distances in a heatmap
# (Rowv = NA disables dendrogram reordering; symm = TRUE for a symmetric matrix)
heatmap(as.matrix(distances), Rowv = NA, symm = TRUE,
labRow = mnist_sample$label[1:10], labCol = mnist_sample$label[1:10]
)
# Minkowski distance of order 3
distances_3 <- dist(mnist_sample[1:10, -1], method="minkowski", p=3)
distances_3
## 1 2 3 4 5 6 7 8 9
## 2 1003
## 3 1170 1229
## 4 1127 1045 1250
## 5 1091 1260 941 1232
## 6 1064 1194 1104 1190 996
## 7 1098 1199 1131 1228 1006 1165
## 8 1007 1169 951 1143 981 1056 1083
## 9 1270 1337 1257 1401 1248 1319 1272 1237
## 10 1187 1268 1134 1219 1085 1167 1096 1133 1181
heatmap(as.matrix(distances_3), Rowv = NA, symm = TRUE,
labRow = mnist_sample$label[1:10], labCol = mnist_sample$label[1:10]
)
# Minkowski distance of order 2 (identical to the Euclidean distance above,
# as the matching output confirms)
distances_2 <- dist(mnist_sample[1:10, -1], method="minkowski", p=2)
distances_2
## 1 2 3 4 5 6 7 8 9
## 2 2186
## 3 2656 2870
## 4 2547 2341 2937
## 5 2407 2959 1976 2871
## 6 2344 2760 2453 2739 2126
## 7 2464 2784 2574 2871 2174 2654
## 8 2150 2669 2000 2586 2067 2273 2408
## 9 2959 3210 2935 3414 2871 3115 2981 2833
## 10 2729 3010 2575 2833 2396 2656 2464 2550 2695
heatmap(as.matrix(distances_2), Rowv = NA, symm = TRUE,
labRow = mnist_sample$label[1:10], labCol = mnist_sample$label[1:10]
)
# Get the first 10 records
mnist_10 <- mnist_sample[1:10, -1]
# Add 1 to avoid NaN when rescaling
mnist_10_prep <- mnist_10 + 1
# Compute the sums per row
sums <- rowSums(mnist_10_prep)
# Compute KL divergence
distances <- philentropy::distance(mnist_10_prep/sums, method="kullback-leibler")
## Metric: 'kullback-leibler' using unit: 'log'; comparing: 10 vectors.
heatmap(as.matrix(distances), Rowv = NA, symm = TRUE,
labRow = mnist_sample$label[1:10], labCol = mnist_sample$label[1:10]
)
# Get the principal components from PCA
pca_output <- prcomp(mnist_sample[, -1])
# Observe a summary of the output
summary(pca_output)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 581.0793 511.1573 499.0990 440.1563 438.994 381.6497
## Proportion of Variance 0.0964 0.0746 0.0712 0.0553 0.055 0.0416
## Cumulative Proportion 0.0964 0.1711 0.2422 0.2975 0.353 0.3942
## PC7 PC8 PC9 PC10 PC11 PC12
## Standard deviation 360.3161 349.933 307.246 298.6138 286.2939 272.0886
## Proportion of Variance 0.0371 0.035 0.027 0.0255 0.0234 0.0211
## Cumulative Proportion 0.4313 0.466 0.493 0.5187 0.5421 0.5632
## PC13 PC14 PC15 PC16 PC17 PC18
## Standard deviation 256.8551 254.4171 251.229 233.2123 230.0690 221.486
## Proportion of Variance 0.0188 0.0185 0.018 0.0155 0.0151 0.014
## Cumulative Proportion 0.5821 0.6006 0.619 0.6341 0.6492 0.663
## PC19 PC20 PC21 PC22 PC23 PC24
## Standard deviation 217.2549 210.0747 200.5780 198.6126 193.2674 1.87e+02
## Proportion of Variance 0.0135 0.0126 0.0115 0.0113 0.0107 9.96e-03
## Cumulative Proportion 0.6767 0.6894 0.7008 0.7121 0.7228 7.33e-01
## PC25 PC26 PC27 PC28 PC29 PC30
## Standard deviation 180.4072 1.74e+02 1.73e+02 1.70e+02 1.66e+02 1.64e+02
## Proportion of Variance 0.0093 8.67e-03 8.56e-03 8.21e-03 7.91e-03 7.66e-03
## Cumulative Proportion 0.7420 7.51e-01 7.59e-01 7.67e-01 7.75e-01 7.83e-01
## PC31 PC32 PC33 PC34 PC35 PC36
## Standard deviation 1.62e+02 1.59e+02 1.54e+02 1.47e+02 1.45e+02 1.43e+02
## Proportion of Variance 7.49e-03 7.21e-03 6.79e-03 6.18e-03 6.01e-03 5.81e-03
## Cumulative Proportion 7.91e-01 7.98e-01 8.05e-01 8.11e-01 8.17e-01 8.23e-01
## PC37 PC38 PC39 PC40 PC41 PC42
## Standard deviation 1.41e+02 1.36e+02 1.32e+02 1.32e+02 1.26e+02 1.23e+02
## Proportion of Variance 5.64e-03 5.26e-03 4.99e-03 4.96e-03 4.56e-03 4.35e-03
## Cumulative Proportion 8.28e-01 8.33e-01 8.38e-01 8.43e-01 8.48e-01 8.52e-01
## PC43 PC44 PC45 PC46 PC47 PC48
## Standard deviation 1.21e+02 1.20e+02 1.19e+02 1.16e+02 1.13e+02 1.12e+02
## Proportion of Variance 4.16e-03 4.14e-03 4.04e-03 3.82e-03 3.64e-03 3.55e-03
## Cumulative Proportion 8.56e-01 8.61e-01 8.65e-01 8.68e-01 8.72e-01 8.76e-01
## PC49 PC50 PC51 PC52 PC53 PC54
## Standard deviation 1.11e+02 1.08e+02 1.07e+02 1.04e+02 1.03e+02 1.01e+02
## Proportion of Variance 3.49e-03 3.36e-03 3.27e-03 3.12e-03 3.02e-03 2.89e-03
## Cumulative Proportion 8.79e-01 8.82e-01 8.86e-01 8.89e-01 8.92e-01 8.95e-01
## PC55 PC56 PC57 PC58 PC59 PC60
## Standard deviation 98.41844 96.19388 96.03764 93.24621 92.09529 91.13565
## Proportion of Variance 0.00277 0.00264 0.00263 0.00248 0.00242 0.00237
## Cumulative Proportion 0.89755 0.90019 0.90283 0.90531 0.90773 0.91010
## PC61 PC62 PC63 PC64 PC65 PC66
## Standard deviation 90.33912 88.43374 87.8278 86.50593 86.13606 84.45834
## Proportion of Variance 0.00233 0.00223 0.0022 0.00214 0.00212 0.00204
## Cumulative Proportion 0.91244 0.91467 0.9169 0.91901 0.92113 0.92317
## PC67 PC68 PC69 PC70 PC71 PC72
## Standard deviation 83.33234 81.00884 80.23066 78.73474 77.59712 77.0849
## Proportion of Variance 0.00198 0.00187 0.00184 0.00177 0.00172 0.0017
## Cumulative Proportion 0.92515 0.92702 0.92886 0.93063 0.93235 0.9341
## PC73 PC74 PC75 PC76 PC77 PC78
## Standard deviation 76.17273 75.46779 75.10109 72.5529 71.73195 71.33942
## Proportion of Variance 0.00166 0.00163 0.00161 0.0015 0.00147 0.00145
## Cumulative Proportion 0.93571 0.93733 0.93894 0.9405 0.94192 0.94337
## PC79 PC80 PC81 PC82 PC83 PC84
## Standard deviation 69.9079 69.10822 68.50964 67.4226 67.22979 65.89167
## Proportion of Variance 0.0014 0.00136 0.00134 0.0013 0.00129 0.00124
## Cumulative Proportion 0.9448 0.94613 0.94747 0.9488 0.95006 0.95130
## PC85 PC86 PC87 PC88 PC89 PC90
## Standard deviation 64.8959 64.62287 63.33802 62.88913 61.9592 61.34202
## Proportion of Variance 0.0012 0.00119 0.00115 0.00113 0.0011 0.00107
## Cumulative Proportion 0.9525 0.95370 0.95484 0.95597 0.9571 0.95814
## PC91 PC92 PC93 PC94 PC95 PC96
## Standard deviation 61.21219 60.39063 59.89925 59.261 58.16600 57.24925
## Proportion of Variance 0.00107 0.00104 0.00102 0.001 0.00097 0.00094
## Cumulative Proportion 0.95921 0.96026 0.96128 0.962 0.96325 0.96419
## PC97 PC98 PC99 PC100 PC101 PC102
## Standard deviation 56.2413 55.55244 55.24024 54.05327 53.72387 53.0814
## Proportion of Variance 0.0009 0.00088 0.00087 0.00083 0.00082 0.0008
## Cumulative Proportion 0.9651 0.96597 0.96684 0.96768 0.96850 0.9693
## PC103 PC104 PC105 PC106 PC107 PC108
## Standard deviation 52.66387 52.23018 51.83450 51.12512 50.70059 50.02274
## Proportion of Variance 0.00079 0.00078 0.00077 0.00075 0.00073 0.00071
## Cumulative Proportion 0.97010 0.97088 0.97164 0.97239 0.97313 0.97384
## PC109 PC110 PC111 PC112 PC113 PC114
## Standard deviation 49.5971 49.21602 49.07307 47.75766 47.10337 46.87950
## Proportion of Variance 0.0007 0.00069 0.00069 0.00065 0.00063 0.00063
## Cumulative Proportion 0.9745 0.97523 0.97592 0.97657 0.97721 0.97784
## PC115 PC116 PC117 PC118 PC119 PC120
## Standard deviation 45.8291 45.6506 45.32725 44.80365 44.55733 43.60770
## Proportion of Variance 0.0006 0.0006 0.00059 0.00057 0.00057 0.00054
## Cumulative Proportion 0.9784 0.9790 0.97962 0.98019 0.98076 0.98130
## PC121 PC122 PC123 PC124 PC125 PC126
## Standard deviation 42.85722 41.9668 41.8655 41.13512 40.74071 40.50967
## Proportion of Variance 0.00052 0.0005 0.0005 0.00048 0.00047 0.00047
## Cumulative Proportion 0.98183 0.9823 0.9828 0.98331 0.98379 0.98425
## PC127 PC128 PC129 PC130 PC131 PC132
## Standard deviation 40.12422 39.84226 39.52123 38.97680 38.38749 38.17122
## Proportion of Variance 0.00046 0.00045 0.00045 0.00043 0.00042 0.00042
## Cumulative Proportion 0.98471 0.98517 0.98561 0.98605 0.98647 0.98689
## PC133 PC134 PC135 PC136 PC137 PC138
## Standard deviation 37.70782 37.6177 37.3220 36.55369 35.92438 35.51238
## Proportion of Variance 0.00041 0.0004 0.0004 0.00038 0.00037 0.00036
## Cumulative Proportion 0.98729 0.9877 0.9881 0.98848 0.98884 0.98920
## PC139 PC140 PC141 PC142 PC143 PC144
## Standard deviation 35.45001 35.15678 34.35638 34.06735 33.29135 33.01342
## Proportion of Variance 0.00036 0.00035 0.00034 0.00033 0.00032 0.00031
## Cumulative Proportion 0.98956 0.98992 0.99025 0.99058 0.99090 0.99121
## PC145 PC146 PC147 PC148 PC149 PC150
## Standard deviation 32.6682 32.1846 31.88599 31.66208 31.18541 30.79636
## Proportion of Variance 0.0003 0.0003 0.00029 0.00029 0.00028 0.00027
## Cumulative Proportion 0.9915 0.9918 0.99210 0.99239 0.99267 0.99294
## PC151 PC152 PC153 PC154 PC155 PC156
## Standard deviation 30.57366 30.32541 29.81812 29.72790 29.16075 28.45111
## Proportion of Variance 0.00027 0.00026 0.00025 0.00025 0.00024 0.00023
## Cumulative Proportion 0.99321 0.99347 0.99372 0.99397 0.99422 0.99445
## PC157 PC158 PC159 PC160 PC161 PC162
## Standard deviation 28.12558 28.00044 27.75274 27.32120 27.00107 26.79261
## Proportion of Variance 0.00023 0.00022 0.00022 0.00021 0.00021 0.00021
## Cumulative Proportion 0.99467 0.99490 0.99512 0.99533 0.99554 0.99574
## PC163 PC164 PC165 PC166 PC167 PC168
## Standard deviation 26.2524 25.99387 25.25316 24.90629 24.69333 24.30137
## Proportion of Variance 0.0002 0.00019 0.00018 0.00018 0.00017 0.00017
## Cumulative Proportion 0.9959 0.99613 0.99632 0.99649 0.99667 0.99684
## PC169 PC170 PC171 PC172 PC173 PC174
## Standard deviation 23.80281 23.76362 23.27539 22.91356 22.85981 22.55678
## Proportion of Variance 0.00016 0.00016 0.00015 0.00015 0.00015 0.00015
## Cumulative Proportion 0.99700 0.99716 0.99731 0.99746 0.99761 0.99776
## PC175 PC176 PC177 PC178 PC179 PC180
## Standard deviation 22.21152 21.76808 21.24362 21.11932 20.94527 20.46394
## Proportion of Variance 0.00014 0.00014 0.00013 0.00013 0.00013 0.00012
## Cumulative Proportion 0.99790 0.99804 0.99816 0.99829 0.99842 0.99854
## PC181 PC182 PC183 PC184 PC185 PC186
## Standard deviation 20.25800 20.19252 19.42600 19.32760 18.6953 18.21247
## Proportion of Variance 0.00012 0.00012 0.00011 0.00011 0.0001 0.00009
## Cumulative Proportion 0.99865 0.99877 0.99888 0.99898 0.9991 0.99918
## PC187 PC188 PC189 PC190 PC191 PC192
## Standard deviation 17.99321 17.53140 17.07084 16.78692 16.39348 16.20101
## Proportion of Variance 0.00009 0.00009 0.00008 0.00008 0.00008 0.00007
## Cumulative Proportion 0.99927 0.99936 0.99944 0.99952 0.99960 0.99968
## PC193 PC194 PC195 PC196 PC197 PC198
## Standard deviation 15.45278 14.49178 13.48301 12.72247 1.2e+01 11.41324
## Proportion of Variance 0.00007 0.00006 0.00005 0.00005 4.0e-05 0.00004
## Cumulative Proportion 0.99974 0.99980 0.99986 0.99990 1.0e+00 0.99998
## PC199 PC200
## Standard deviation 8.49323 2.58e-13
## Proportion of Variance 0.00002 0.00e+00
## Cumulative Proportion 1.00000 1.00e+00
# Store the first two coordinates and the label in a data frame
pca_plot <- data.frame(pca_x = pca_output$x[, 1], pca_y = pca_output$x[, 2],
label = as.factor(mnist_sample$label)
)
# Plot the first two principal components using the true labels as color and shape
ggplot(pca_plot, aes(x = pca_x, y = pca_y, color = label)) +
ggtitle("PCA of MNIST sample") +
geom_text(aes(label = label)) +
theme(legend.position = "none")
tsne_output <- Rtsne::Rtsne(mnist_sample[, -1], PCA = FALSE, dims = 2) # modifying the default parameters
# Explore the tsne_output structure
str(tsne_output)
## List of 14
## $ N : int 200
## $ Y : num [1:200, 1:2] -2.6 1.69 1.23 -7.73 5.02 ...
## $ costs : num [1:200] 0.00562 0.00481 0.000989 0.004309 0.003997 ...
## $ itercosts : num [1:20] 52 52.6 52.4 53.7 53.4 ...
## $ origD : int 50
## $ perplexity : num 30
## $ theta : num 0.5
## $ max_iter : num 1000
## $ stop_lying_iter : int 250
## $ mom_switch_iter : int 250
## $ momentum : num 0.5
## $ final_momentum : num 0.8
## $ eta : num 200
## $ exaggeration_factor: num 12
# Have a look at the first records from the t-SNE output
head(tsne_output$Y)
## [,1] [,2]
## [1,] -2.60 4.919
## [2,] 1.69 8.931
## [3,] 1.23 -0.786
## [4,] -7.73 -9.117
## [5,] 5.02 -1.973
## [6,] -5.77 3.864
# Store the first two coordinates and the label in a data.frame
tsne_plot <- data.frame(tsne_x = tsne_output$Y[, 1], tsne_y = tsne_output$Y[, 2],
label = as.factor(mnist_sample$label)
)
# Plot the t-SNE embedding using the true labels as color and shape
ggplot(tsne_plot, aes(x = tsne_x, y = tsne_y, color = label)) +
ggtitle("T-Sne output") +
geom_text(aes(label = label)) +
theme(legend.position = "none")
Chapter 2 - Introduction to t-SNE
Building a t-SNE Embedding:
Optimal Number of t-SNE Iterations:
Effect of Perplexity Parameter:
Classifying Digits with t-SNE:
Example code includes:
# Compute t-SNE without doing the PCA step
tsne_output <- Rtsne::Rtsne(mnist_sample[, -1], PCA = FALSE, dims = 3)
# Show the obtained embedding coordinates
head(tsne_output$Y)
## [,1] [,2] [,3]
## [1,] -3.90 0.626 2.2402
## [2,] 9.86 -8.222 9.6127
## [3,] -8.74 -5.232 4.4015
## [4,] -9.58 9.981 3.3003
## [5,] -7.68 -3.214 -0.0745
## [6,] -7.74 2.807 7.7508
# Store the first two coordinates and plot them
tsne_plot <- data.frame(tsne_x = tsne_output$Y[, 1], tsne_y = tsne_output$Y[, 2],
digit = as.factor(mnist_sample$label)
)
# Plot the coordinates
ggplot(tsne_plot, aes(x = tsne_x, y = tsne_y, color = digit)) +
ggtitle("t-SNE of MNIST sample") +
geom_text(aes(label = digit)) +
theme(legend.position = "none")
# Inspect the output object's structure
str(tsne_output)
## List of 14
## $ N : int 200
## $ Y : num [1:200, 1:3] -3.9 9.86 -8.74 -9.58 -7.68 ...
## $ costs : num [1:200] 0.00256 0.00461 0.00169 0.00222 0.0029 ...
## $ itercosts : num [1:20] 52.4 53.4 52.6 52.9 51.7 ...
## $ origD : int 50
## $ perplexity : num 30
## $ theta : num 0.5
## $ max_iter : num 1000
## $ stop_lying_iter : int 250
## $ mom_switch_iter : int 250
## $ momentum : num 0.5
## $ final_momentum : num 0.8
## $ eta : num 200
## $ exaggeration_factor: num 12
# Show the total K-L divergence recorded at every 50th iteration
tsne_output$itercosts
## [1] 52.418 53.442 52.558 52.901 51.656 0.802 0.579 0.533 0.521 0.520
## [11] 0.517 0.515 0.513 0.511 0.510 0.510 0.508 0.507 0.504 0.503
tsne_output$costs
## [1] 0.002556 0.004608 0.001691 0.002216 0.002898 0.004183 0.002754 0.001666
## [9] 0.001401 0.002071 0.001997 0.002965 0.002408 0.002127 0.004021 0.001847
## [17] 0.004202 0.001360 0.000661 0.002919 0.001963 0.002013 0.005414 0.002476
## [25] 0.002521 0.002330 0.001597 0.003748 0.002867 0.001238 0.006646 0.002232
## [33] 0.004471 0.002023 0.003331 0.003620 0.003690 0.003046 0.001068 0.003092
## [41] 0.004347 0.002435 0.002224 0.001266 0.002469 0.003183 0.003292 0.005514
## [49] 0.001522 0.003355 0.002538 0.003548 0.000973 0.003724 0.003210 0.005844
## [57] 0.001530 0.003629 0.000501 0.001255 0.002963 0.001798 0.000946 0.001219
## [65] 0.001717 0.000864 0.001107 0.001536 0.001697 0.001502 0.003486 0.004288
## [73] 0.000579 0.004023 0.001357 0.003757 0.000792 0.002060 0.001243 0.001878
## [81] 0.002739 0.002442 0.003554 0.001439 0.004956 0.001095 0.002383 0.002107
## [89] 0.001548 0.002985 0.003448 0.000709 0.003105 0.001277 0.006636 0.004045
## [97] 0.001300 0.001124 0.002436 0.001912 0.001468 0.003581 0.001166 0.001716
## [105] 0.001500 0.001844 0.001151 0.003541 0.002041 0.001340 0.001106 0.001858
## [113] 0.001699 0.001883 0.003128 0.003232 0.003683 0.003367 0.000254 0.004959
## [121] 0.002403 0.005469 0.000558 0.001905 0.004205 0.002494 0.003512 0.003912
## [129] 0.004192 0.003329 0.010506 0.002245 0.002350 0.001941 0.001677 0.003609
## [137] 0.004245 0.000986 0.002019 0.003203 0.001641 0.002979 0.002684 0.001287
## [145] 0.001096 0.003645 0.001900 0.001790 0.002159 0.002323 0.000495 0.001795
## [153] 0.001945 0.001819 0.005875 0.004069 0.004675 0.003174 0.000603 0.000859
## [161] 0.000715 0.000791 0.003574 0.001305 0.001697 0.001860 0.001829 0.001890
## [169] 0.003916 0.002265 0.000837 0.001701 0.003167 0.002093 0.003039 0.004440
## [177] 0.001640 0.002899 0.004424 0.002667 0.001839 0.001788 0.002397 0.001005
## [185] 0.002155 0.000892 0.001803 0.002666 0.004028 0.001752 0.003715 0.001889
## [193] 0.002250 0.003404 0.001618 0.000842 0.001877 0.003156 0.003224 0.001500
# Plot the total K-L divergence per 50 iterations, then the per-record costs after the final iteration
plot(tsne_output$itercosts, type = "l")
plot(tsne_output$costs, type = "l")
# Generate a three-dimensional t-SNE embedding without PCA
tsne_output <- Rtsne::Rtsne(mnist_sample[, -1], PCA=FALSE, dims=3)
# Generate a new t-SNE embedding with the same hyper-parameter values
tsne_output_new <- Rtsne::Rtsne(mnist_sample[, -1], PCA=FALSE, dims=3)
# Check if the two outputs are identical
identical(tsne_output, tsne_output_new)
## [1] FALSE
# Generate a three-dimensional t-SNE embedding without PCA
set.seed(1234)
tsne_output <- Rtsne::Rtsne(mnist_sample[, -1], PCA = FALSE, dims = 3)
# Generate a new t-SNE embedding with the same hyper-parameter values
set.seed(1234)
tsne_output_new <- Rtsne::Rtsne(mnist_sample[, -1], PCA = FALSE, dims = 3)
# Check if the two outputs are identical
identical(tsne_output, tsne_output_new)
## [1] TRUE
# Set seed to ensure reproducible results
set.seed(1234)
# Execute a t-SNE with 2000 iterations
tsne_output <- Rtsne::Rtsne(mnist_sample[, -1], PCA=TRUE, dims=2, max_iter=2000)
# Observe the output costs
tsne_output$itercosts
## [1] 53.214 53.951 54.243 52.808 51.851 1.166 0.920 0.836 0.799 0.788
## [11] 0.779 0.771 0.768 0.766 0.763 0.739 0.731 0.722 0.722 0.715
## [21] 0.713 0.701 0.699 0.697 0.695 0.689 0.684 0.684 0.681 0.677
## [31] 0.676 0.676 0.675 0.675 0.674 0.674 0.674 0.674 0.674 0.676
# Get the index (costs are recorded every 50 iterations) with the minimum K-L cost
which.min(tsne_output$itercosts)
## [1] 39
# Set seed to ensure reproducible results
set.seed(1234)
# Execute a t-SNE with perplexity 5
tsne_output_5 <- Rtsne::Rtsne(mnist_sample[, -1], perplexity=5, max_iter=1200)
# Observe the returned K-L divergence costs at every 50th iteration
tsne_output_5$itercosts
## [1] 75.889 76.215 74.568 75.288 74.557 2.479 1.575 1.309 1.137 0.999
## [11] 0.932 0.892 0.881 0.848 0.823 0.816 0.815 0.812 0.810 0.806
## [21] 0.804 0.798 0.783 0.765
# Set seed to ensure reproducible results
set.seed(1234)
# Execute a t-SNE with perplexity 20
tsne_output_20 <- Rtsne::Rtsne(mnist_sample[, -1], perplexity=20, max_iter=1200)
# Observe the returned K-L divergence costs at every 50th iteration
tsne_output_20$itercosts
## [1] 56.897 57.863 57.047 57.774 58.292 1.375 1.055 0.917 0.895 0.888
## [11] 0.858 0.819 0.785 0.765 0.762 0.749 0.741 0.737 0.735 0.733
## [21] 0.733 0.733 0.725 0.726
# Set seed to ensure reproducible results
set.seed(1234)
# Execute a t-SNE with perplexity 50
tsne_output_50 <- Rtsne::Rtsne(mnist_sample[, -1], perplexity=50, max_iter=1200)
# Observe the returned K-L divergence costs at every 50th iteration
tsne_output_50$itercosts
## [1] 45.839 45.866 45.343 46.623 46.922 0.929 0.698 0.597 0.583 0.571
## [11] 0.569 0.566 0.565 0.567 0.566 0.567 0.565 0.567 0.567 0.567
## [21] 0.565 0.567 0.568 0.569
# Observe the K-L divergence costs with perplexity 5 and 50
tsne_output_5$itercosts
## [1] 75.889 76.215 74.568 75.288 74.557 2.479 1.575 1.309 1.137 0.999
## [11] 0.932 0.892 0.881 0.848 0.823 0.816 0.815 0.812 0.810 0.806
## [21] 0.804 0.798 0.783 0.765
tsne_output_50$itercosts
## [1] 45.839 45.866 45.343 46.623 46.922 0.929 0.698 0.597 0.583 0.571
## [11] 0.569 0.566 0.565 0.567 0.566 0.567 0.565 0.567 0.567 0.567
## [21] 0.565 0.567 0.568 0.569
# Generate the data frame to visualize the embedding
tsne_plot_5 <- data.frame(tsne_x = tsne_output_5$Y[, 1], tsne_y = tsne_output_5$Y[, 2], digit = as.factor(mnist_sample$label))
tsne_plot_50 <- data.frame(tsne_x = tsne_output_50$Y[, 1], tsne_y = tsne_output_50$Y[, 2], digit = as.factor(mnist_sample$label))
# Plot the obtained embeddings for perplexity 5 vs 50.
# Fix: the titles previously said "1300 iter", but both models above were
# fit with max_iter = 1200 — the titles now match the actual setting.
ggplot(tsne_plot_5, aes(x = tsne_x, y = tsne_y, color = digit)) +
  ggtitle("MNIST t-SNE with 1200 iter and Perplexity=5") +
  geom_text(aes(label = digit)) +
  theme(legend.position = "none")
ggplot(tsne_plot_50, aes(x = tsne_x, y = tsne_y, color = digit)) +
  ggtitle("MNIST t-SNE with 1200 iter and Perplexity=50") +
  geom_text(aes(label = digit)) +
  theme(legend.position = "none")
# Prepare the data.frame
tsne_plot <- data.frame(tsne_x = tsne_output_50$Y[1:100, 1],
tsne_y = tsne_output_50$Y[1:100, 2],
digit = as.factor(mnist_sample$label[1:100])
)
# Plot the obtained embedding
ggplot(tsne_plot, aes(x = tsne_x, y = tsne_y, color = digit)) +
ggtitle("MNIST embedding of the first 100 digits") +
geom_text(aes(label = digit)) +
theme(legend.position="none")
# Get the first 100 records and set the column names
dt_prototypes <- as.data.table(tsne_output_50$Y[1:100, ])
setnames(dt_prototypes, c("X","Y"))
# Paste the label column as factor
dt_prototypes[, label := as.factor(mnist_sample[1:100,]$label)]
# Compute the centroids per label
dt_prototypes[, mean_X := mean(X), by = label]
dt_prototypes[, mean_Y := mean(Y), by = label]
# Get the unique records per label
dt_prototypes <- unique(dt_prototypes, by = "label")
dt_prototypes
## X Y label mean_X mean_Y
## 1: -0.307 0.168 5 0.702 -0.137
## 2: -2.547 5.049 0 0.624 0.715
## 3: -3.043 0.658 7 -3.946 0.142
## 4: -2.593 -0.343 9 -2.609 -2.543
## 5: -1.632 -0.530 3 2.309 1.000
## 6: -4.941 1.640 4 -2.938 -1.811
## 7: -0.975 1.433 1 -0.403 1.258
## 8: 2.874 5.434 2 1.153 4.649
## 9: 2.486 -4.996 6 2.321 -3.062
## 10: -0.808 -3.918 8 -0.545 -1.615
# Store the last 100 records in distances and set column names
distances <- as.data.table(tsne_output_50$Y[101:200, ])
setnames(distances, c("X", "Y"))
# Paste the true label
distances[, label := mnist_sample[101:200,]$label]
# Filter only those labels that are 1 or 0
distances <- distances[label == 1 | label == 0]
# Compute Euclidean distance to the prototype (centroid) of digit 1.
# Fix: the original computed sqrt(((X - mx) + (Y - my))^2), i.e. the absolute
# value of the SUM of the coordinate differences — not a distance. Euclidean
# distance requires squaring each coordinate difference before summing.
distances[, dist_1 := sqrt((X - dt_prototypes[label == 1,]$mean_X)^2 +
                           (Y - dt_prototypes[label == 1,]$mean_Y)^2)]
# Compute the basic statistics of distances from records of class 1
summary(distances[label == 1]$dist_1)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.045 0.652 0.886 1.121 1.819 2.253
# Compute the basic statistics of distances from records of class 0
summary(distances[label == 0]$dist_1)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.16 0.38 1.34 2.70 5.22 6.05
# Plot the histogram of distances of each class
ggplot(distances, aes(x = dist_1, fill = as.factor(label))) +
geom_histogram(binwidth = 5, alpha = .5, position = "identity", show.legend = FALSE) +
ggtitle("Distribution of Euclidean distance 1 vs 0")
Chapter 3 - Using t-SNE with Predictive Models
Credit Card Fraud Detection:
Training Random Forest Models:
Predicting Data:
Visualizing Neural Network Layers:
Example code includes:
# Look at the data dimensions
dim(creditcard)
## [1] 28923 31
# Explore the column names
names(creditcard)
## [1] "Time" "V1" "V2" "V3" "V4" "V5" "V6" "V7"
## [9] "V8" "V9" "V10" "V11" "V12" "V13" "V14" "V15"
## [17] "V16" "V17" "V18" "V19" "V20" "V21" "V22" "V23"
## [25] "V24" "V25" "V26" "V27" "V28" "Amount" "Class"
# Observe some records
str(creditcard)
## Classes 'data.table' and 'data.frame': 28923 obs. of 31 variables:
## $ Time : num 406 472 4462 6986 7519 ...
## $ V1 : num -2.31 -3.04 -2.3 -4.4 1.23 ...
## $ V2 : num 1.95 -3.16 1.76 1.36 3.02 ...
## $ V3 : num -1.61 1.09 -0.36 -2.59 -4.3 ...
## $ V4 : num 4 2.29 2.33 2.68 4.73 ...
## $ V5 : num -0.522 1.36 -0.822 -1.128 3.624 ...
## $ V6 : num -1.4265 -1.0648 -0.0758 -1.7065 -1.3577 ...
## $ V7 : num -2.537 0.326 0.562 -3.496 1.713 ...
## $ V8 : num 1.3917 -0.0678 -0.3991 -0.2488 -0.4964 ...
## $ V9 : num -2.77 -0.271 -0.238 -0.248 -1.283 ...
## $ V10 : num -2.772 -0.839 -1.525 -4.802 -2.447 ...
## $ V11 : num 3.202 -0.415 2.033 4.896 2.101 ...
## $ V12 : num -2.9 -0.503 -6.56 -10.913 -4.61 ...
## $ V13 : num -0.5952 0.6765 0.0229 0.1844 1.4644 ...
## $ V14 : num -4.29 -1.69 -1.47 -6.77 -6.08 ...
## $ V15 : num 0.38972 2.00063 -0.69883 -0.00733 -0.33924 ...
## $ V16 : num -1.141 0.667 -2.282 -7.358 2.582 ...
## $ V17 : num -2.83 0.6 -4.78 -12.6 6.74 ...
## $ V18 : num -0.0168 1.7253 -2.6157 -5.1315 3.0425 ...
## $ V19 : num 0.417 0.283 -1.334 0.308 -2.722 ...
## $ V20 : num 0.12691 2.10234 -0.43002 -0.17161 0.00906 ...
## $ V21 : num 0.517 0.662 -0.294 0.574 -0.379 ...
## $ V22 : num -0.035 0.435 -0.932 0.177 -0.704 ...
## $ V23 : num -0.465 1.376 0.173 -0.436 -0.657 ...
## $ V24 : num 0.3202 -0.2938 -0.0873 -0.0535 -1.6327 ...
## $ V25 : num 0.0445 0.2798 -0.1561 0.2524 1.4889 ...
## $ V26 : num 0.178 -0.145 -0.543 -0.657 0.567 ...
## $ V27 : num 0.2611 -0.2528 0.0396 -0.8271 -0.01 ...
## $ V28 : num -0.1433 0.0358 -0.153 0.8496 0.1468 ...
## $ Amount: num 0 529 240 59 1 ...
## $ Class : chr "1" "1" "1" "1" ...
## - attr(*, ".internal.selfref")=<externalptr>
# Generate a summary
# summary(creditcard)
# Plot a histogram of the transaction time
ggplot(creditcard, aes(x = Time)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Extract positive and negative instances of fraud
creditcard_pos <- creditcard[Class == 1]
creditcard_neg <- creditcard[Class == 0]
# Fix the seed
set.seed(1234)
# Create a new negative balanced dataset by undersampling
creditcard_neg_bal <- creditcard_neg[sample(1:nrow(creditcard_neg), nrow(creditcard_pos)), ]
# Generate a balanced train set
creditcard_train <- rbind(creditcard_pos, creditcard_neg_bal)
# Fix the seed
set.seed(1234)
# Separate x and y sets
train_x <- creditcard_train[, -31]
train_y <- as.factor(creditcard_train$Class)
# Train a random forest
rf_model <- randomForest::randomForest(train_x, train_y, ntree=100)
# Plot the error evolution and variable importance
plot(rf_model)
randomForest::varImpPlot(rf_model)
# Set the seed
set.seed(1234)
# Generate the t-SNE embedding
tsne_output <- Rtsne::Rtsne(as.matrix(creditcard_train[, -31]), check_duplicates = FALSE, PCA=FALSE)
# Generate a data frame to plot the result
tsne_plot <- data.frame(tsne_x = tsne_output$Y[, 1],
tsne_y = tsne_output$Y[, 2],
Class = creditcard_train$Class
)
# Plot the embedding using ggplot and the label
ggplot(tsne_plot, aes(x = tsne_x, y = tsne_y, color = Class)) +
ggtitle("t-SNE of credit card fraud train set") +
geom_text(aes(label = Class)) + theme(legend.position = "none")
# Fix the seed
set.seed(1234)
# Train a random forest
rf_model_tsne <- randomForest::randomForest(tsne_plot[, c("tsne_x", "tsne_y")],
as.factor(creditcard_train$Class), ntree=100
)
# Plot the error evolution
plot(rf_model_tsne)
# Plot the variable importance
randomForest::varImpPlot(rf_model_tsne)
# Predict on the test set using the random forest
# pred_rf <- predict(rf_model, creditcard_test, type = "prob")
# Plot a probability distribution of the target class
# hist(pred_rf[, 2])
# Compute the area under the curve
# pred <- prediction(pred_rf[, 2], creditcard_test$Class)
# perf <- performance(pred, measure = "auc")
# perf@y.values
# Predict on the test set using the random forest generated with t-SNE features
# pred_rf <- predict(rf_model_tsne, test_x, type = "prob")
# Plot a probability distribution of the target class
# hist(pred_rf[, 2])
# Compute the area under the curve
# pred <- prediction(pred_rf[, 2], creditcard_test$Class)
# perf <- performance(pred, measure="auc")
# perf@y.values
# Observe the dimensions
# dim(layer_128_train)
# Show the first six records of the last ten columns
# head(layer_128_train[, 118:128])
# Generate a summary of all columns
# summary(layer_128_train)
# Set the seed
# set.seed(1234)
# Generate the t-SNE
# tsne_output <- Rtsne(as.matrix(layer_128_train), check_duplicates=FALSE, max_iter=400, perplexity=50)
# Prepare data.frame
# tsne_plot <- data.frame(tsne_x = tsne_output$Y[, 1], tsne_y = tsne_output$Y[, 2],
# Class = creditcard_train$Class
# )
# Plot the data
# ggplot(tsne_plot, aes(x = tsne_x, y = tsne_y, color = Class)) +
# geom_point() +
# ggtitle("Credit card embedding of Last Neural Network Layer")
Chapter 4 - Generalized Low Rank Models
Exploring Fashion MNIST dataset:
Generalized Low Rank Models (GLRM):
Visualizing a GLRM Model:
Dealing with Missing Data and Speeding-Up Models:
Summary and Wrap-Up:
Example code includes:
# Show the dimensions
dim(fashion_mnist)
## [1] 500 785
# Create a summary of the last five columns
summary(fashion_mnist[, 780:785])
## pixel779 pixel780 pixel781 pixel782 pixel783
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 0.0
## 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.0
## Median : 0.0 Median : 0.0 Median : 0.0 Median : 0.0 Median : 0.0
## Mean : 23.5 Mean : 18.5 Mean : 7.1 Mean : 2.3 Mean : 0.5
## 3rd Qu.: 0.0 3rd Qu.: 1.0 3rd Qu.: 0.0 3rd Qu.: 0.0 3rd Qu.: 0.0
## Max. :224.0 Max. :233.0 Max. :204.0 Max. :171.0 Max. :77.0
## pixel784
## Min. :0
## 1st Qu.:0
## Median :0
## Mean :0
## 3rd Qu.:0
## Max. :0
# Table with the class distribution
table(fashion_mnist$label)
##
## 0 1 2 3 4 5 6 7 8 9
## 42 44 50 49 53 52 59 54 46 51
# Pixel coordinate grid for a 28x28 image: x cycles 1..28 within each row,
# y runs 28 down to 1 so the first pixel row is drawn at the top of the plot.
xy_axis <- data.frame(x=rep(1:28, times=28), y=rep(28:1, each=28))
# Reusable list of ggplot layers for rendering a digit/garment image:
# a raster layer, a white-to-black intensity gradient (legend suppressed),
# and a theme that strips axes, ticks, grid and backgrounds so only the
# image itself is visible.
plot_theme <- list( raster = geom_raster(hjust = 0, vjust = 0),
gradient_fill = scale_fill_gradient(low = "white", high = "black", guide = FALSE),
theme = theme(axis.line = element_blank(), axis.text = element_blank(),
axis.ticks = element_blank(), axis.title = element_blank(),
panel.background = element_blank(), panel.border = element_blank(),
panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
plot.background = element_blank()
)
)
# Human-readable Fashion-MNIST class names, indexed by label value
# (label 0 = "T-shirt/top", ..., label 9 = "Ankle boot").
class_names <- c('T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot'
)
# Get the data from the last image
plot_data <- cbind(xy_axis, fill = as.data.frame(t(fashion_mnist[500, -1]))[,1])
# Observe the first records
head(plot_data)
## x y fill
## 1 1 28 0
## 2 2 28 0
## 3 3 28 0
## 4 4 28 0
## 5 5 28 0
## 6 6 28 0
# Plot the image using ggplot()
ggplot(plot_data, aes(x, y, fill = fill)) +
ggtitle(class_names[as.integer(fashion_mnist[500, 1])]) +
plot_theme
# Start a connection with the h2o cluster
h2o::h2o.init()
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 2 days 18 hours
## H2O cluster timezone: America/Chicago
## H2O data parsing timezone: UTC
## H2O cluster version: 3.26.0.2
## H2O cluster version age: 5 months and 28 days !!!
## H2O cluster name: H2O_started_from_R_Dave_bvu150
## H2O cluster total nodes: 1
## H2O cluster total memory: 4.18 GB
## H2O cluster total cores: 4
## H2O cluster allowed cores: 4
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: Amazon S3, Algos, AutoML, Core V3, Core V4
## R Version: R version 3.6.2 (2019-12-12)
## Warning in h2o.clusterInfo():
## Your H2O cluster version is too old (5 months and 28 days)!
## Please download and install the latest version from http://h2o.ai/download/
# Store the data into h2o cluster
fashion_mnist.hex <- h2o::as.h2o(fashion_mnist, "fashion_mnist.hex")
##
|
| | 0%
|
|======================================================================| 100%
# Launch a GLRM model over fashion_mnist data
model_glrm <- h2o::h2o.glrm(training_frame = fashion_mnist.hex, cols = 2:ncol(fashion_mnist),
k = 2, seed = 123, max_iterations = 100
)
## Warning in .h2o.startModelJob(algo, params, h2oRestApiVersion): Dropping bad and constant columns: [pixel58, pixel29, pixel1, pixel28, pixel784].
##
|
| | 0%
|
|= | 1%
|
|======================================================================| 100%
# Plotting the convergence
plot(model_glrm)
# Launch a GLRM model with normalized fashion_mnist data
model_glrm <- h2o::h2o.glrm(training_frame = fashion_mnist.hex, transform = "NORMALIZE",
cols = 2:ncol(fashion_mnist), k = 2, seed = 123, max_iterations = 100
)
## Warning in .h2o.startModelJob(algo, params, h2oRestApiVersion): Dropping bad and constant columns: [pixel58, pixel29, pixel1, pixel28, pixel784].
##
|
| | 0%
|
|= | 1%
|
|=========================== | 39%
|
|======================================================================| 100%
# Plotting the convergence
plot(model_glrm)
X_matrix <- as.data.table(h2o::h2o.getFrame(model_glrm@model$representation_name))
## Found more than one class "textConnection" in cache; using the first, from namespace 'RJSONIO'
## Also defined by 'BiocGenerics'
## Found more than one class "textConnection" in cache; using the first, from namespace 'RJSONIO'
## Also defined by 'BiocGenerics'
# Dimension of X_matrix
dim(X_matrix)
## [1] 500 2
# First records of X_matrix
head(X_matrix)
## Arch1 Arch2
## 1: -0.0247 0.198
## 2: 0.5459 0.459
## 3: -0.0673 -0.597
## 4: -0.5037 0.443
## 5: -0.1740 -0.156
## 6: 0.5486 0.279
# Plot the records in the new two dimensional space
ggplot(as.data.table(X_matrix), aes(x= Arch1, y = Arch2, color = fashion_mnist$label)) +
ggtitle("Fashion Mnist GLRM Archetypes") +
geom_text(aes(label = fashion_mnist$label)) +
theme(legend.position="none")
# Store the label of each record and compute the centroids
X_matrix[, label := as.numeric(fashion_mnist$label)]
X_matrix[, mean_x := mean(Arch1), by = label]
X_matrix[, mean_y := mean(Arch2), by = label]
# Get one record per label and create a vector with class names
X_mean <- unique(X_matrix, by = "label")
label_names <- c("T-shirt/top", "Trouser", "Pullover", "Dress", "Coat",
"Sandal", "Shirt", "Sneaker", "Bag", "Ankle boot"
)
# Plot the centroids
ggplot(X_mean, aes(x = mean_x, y = mean_y, color = as.factor(label))) +
ggtitle("Fashion Mnist GLRM class centroids") +
geom_text(aes(label = label_names[label])) +
theme(legend.position="none")
# Randomly blank out ~22.5% of the elements of x with NA.
#
# Fix: the original drew indices with sample(..., replace = TRUE) and then
# de-duplicated them, so the realized missing fraction was random and
# systematically BELOW the intended 22.5%. Sampling without replacement
# guarantees exactly round(0.225 * length(x)) positions are set to NA.
# Also replaces the unsafe 1:length(x) (breaks for length-0 input) with
# seq_along(x).
#
# @param x A vector.
# @return x with round(0.225 * length(x)) randomly chosen elements set to NA.
makeNA <- function(x) {
  n_missing <- round(0.225 * length(x))
  na_idx <- sample(seq_along(x), n_missing)
  x[na_idx] <- NA
  x
}
fashion_mnist_miss <- fashion_mnist %>%
select(-label) %>%
apply(1, FUN=makeNA)
# Store the input data in h2o
fashion_mnist_miss.hex <- h2o::as.h2o(fashion_mnist_miss, "fashion_mnist_miss.hex")
##
|
| | 0%
|
|======================================================================| 100%
# Build a GLRM model
model_glrm <- h2o::h2o.glrm(training_frame = fashion_mnist_miss.hex, transform="NORMALIZE",
k=2, max_iterations=100
)
##
|
| | 0%
|
|= | 1%
|
|=================== | 27%
|
|=================================== | 50%
|
|=================================================== | 73%
|
|=================================================================== | 95%
|
|======================================================================| 100%
# Impute missing values
fashion_pred <- h2o::h2o.predict(model_glrm, fashion_mnist_miss.hex)
##
|
| | 0%
|
|======================================================================| 100%
# Observe the statistics of the first 5 pixels
summary(fashion_pred[, 1:5])
## Warning in summary.H2OFrame(fashion_pred[, 1:5]): Approximated quantiles
## computed! If you are interested in exact quantiles, please pass the
## `exact_quantiles=TRUE` parameter.
## reconstr_V1 reconstr_V2 reconstr_V3
## Min. :-0.3249504 Min. :-0.3457294 Min. :-0.186293
## 1st Qu.:-0.2372956 1st Qu.:-0.2526312 1st Qu.:-0.136938
## Median : 0.0287444 Median : 0.0295671 Median : 0.011318
## Mean : 0.0007149 Mean : 0.0001298 Mean :-0.004211
## 3rd Qu.: 0.2117430 3rd Qu.: 0.2241115 3rd Qu.: 0.113663
## Max. : 0.3585177 Max. : 0.3801826 Max. : 0.196304
## reconstr_V4 reconstr_V5
## Min. :-0.501134 Min. :-0.179736
## 1st Qu.:-0.364382 1st Qu.:-0.130077
## Median : 0.050674 Median : 0.022016
## Mean : 0.007238 Mean : 0.006123
## 3rd Qu.: 0.336974 3rd Qu.: 0.126495
## Max. : 0.565161 Max. : 0.209747
# Get the starting timestamp
time_start <- proc.time()
# Train the random forest
rf_model <- randomForest::randomForest(x = fashion_mnist[, -1], y = fashion_mnist$label, ntree = 20)
# Get the end timestamp
time_end <- timetaken(time_start)
# Show the error and the time
rf_model$err.rate[20]
## [1] 0.302
time_end
## [1] "0.440s elapsed (0.440s cpu)"
# Get the starting timestamp
# time_start <- proc.time()
# Train the random forest
# rf_model <- randomForest(x = train_x, y = train_y, ntree = 500)
# Get the end timestamp
# time_end <- timetaken(time_start)
# Show the error and the time
# rf_model$err.rate[500]
# time_end
Chapter 1 - Introduction
Introduction:
Inline Functions with cppFunction:
int x = 37 ; return x ; double res = x + y ; return res ; res <- x + y res Debugging:
// Some values int x = 42 ; // Printing a message to the R console Rprintf( "some message in the console, x=%d\\n", x ) ; // Return some int // A simple error message if( x < 0 ) stop( "sorry x should be positive" ) ; // A formatted error message if( x > 20 ) stop( "x is too big (x=%d)", x ) ; // Return some int Example code includes:
# Load microbenchmark
library(microbenchmark)
library(Rcpp)
# Define the function sum_loop
# Sum the elements of x by explicit left-to-right accumulation.
# Intentionally loop-based (not vectorized) so its performance can be
# benchmarked against the C-level sum().
sum_loop <- function(x) {
  total <- 0
  for (value in x) {
    total <- total + value
  }
  total
}
x <- rnorm(100000)
# Check for equality
all.equal(sum_loop(x), sum(x))
# Compare the performance
microbenchmark(sum_loop = sum_loop(x), R_sum = sum(x))
# Evaluate 2 + 2 in C++
x <- evalCpp("2+2")
# Evaluate 2 + 2 in R
y <- 2+2
# Storage modes of x and y
storage.mode(x)
storage.mode(y)
# Change the C++ expression so that it returns a double
z <- evalCpp("2.0 + 2")
# Evaluate 17 / 2 in C++
evalCpp("17/2")
# Cast 17 to a double and divide by 2
evalCpp("(double)17/2")
# Cast 56.3 to an int
evalCpp("(int)56.3")
# Define the function the_answer()
cppFunction('
int the_answer() {
return 42 ;
}
')
# Check the_answer() returns the integer 42
the_answer() == 42L
# Define the function euclidean_distance()
cppFunction('
double euclidean_distance(double x, double y) {
return sqrt(x*x + y*y) ;
}
')
# Calculate the euclidean distance
euclidean_distance(1.5, 2.5)
# Define the function add()
cppFunction('
int add(int x, int y) {
int res = x + y ;
Rprintf("** %d + %d = %d\\n", x, y, res) ;
return res ;
}
')
# Call add() to print THE answer
add(40, 2)
cppFunction('
// adds x and y, but only if they are positive
int add_positive_numbers(int x, int y) {
// if x is negative, stop
if( x < 0 ) stop("x is negative") ;
// if y is negative, stop
if( y < 0 ) stop("y is negative") ;
return x + y ;
}
')
# Call the function with positive numbers
add_positive_numbers(2, 3)
# Call the function with a negative number
add_positive_numbers(-2, 3)
Chapter 2 - Functions and Control Flow
C++ Functions Belong to C++ Files:
return 2*x ; Writing Functions in C++:
if( x < 0 ){ Rprintf( "x is negative" ) ; } else if( x == 0 ){ Rprintf( "x is zero" ) ; } else if( x > 0 ){ Rprintf( "x is positive" ) ; } else { Rprintf( "x is not a number" ) ; } For Loops:
// some code using i if( n < 0 ) { stop( "n must be positive, I see n=%d", n ) ;} int result = 0 ; for( int i=0; i<n; i++){ if( i == 13 ){ Rprintf( "I cannot handle that, I am superstitious" ) ; break ; } result = result + (i+1) ; } return result ; While Loops:
body increment body Example code includes:
# This C++ code should be saved in its own file (e.g. 'script.cpp')
# and compiled from R with sourceCpp('script.cpp')
#include <Rcpp.h>
using namespace Rcpp ;
// Export the function to R
// [[Rcpp::export]]
double twice(double x) {
  // Double the input; 2.0 * x is equivalent to the original x + x
  // for IEEE doubles.
  return 2.0 * x;
}
// Include the Rcpp.h header
#include <Rcpp.h>
// Use the Rcpp namespace
using namespace Rcpp;
// [[Rcpp::export]]
int the_answer() {
// Return 42
return 42;
}
/*** R
# Call the_answer() to check you get the right result
the_answer()
*/
#include <Rcpp.h>
using namespace Rcpp;
// File-local helper: the square of a number (not exported to R).
double square(double x) {
  return x * x;
}

// Distance from the origin to the point (x, y): sqrt(x^2 + y^2).
// [[Rcpp::export]]
double dist(double x, double y) {
  double xx = square(x);
  double yy = square(y);
  return sqrt(xx + yy);
}
#include <Rcpp.h>
using namespace Rcpp;
double square(double x) {
return x * x ;
}
// [[Rcpp::export]]
double dist(double x, double y) {
return sqrt(square(x) + square(y));
}
// Start the Rcpp R comment block
/*** R
# Call dist() to the point (3, 4)
dist(3, 4)
# Close the Rcpp R comment block
*/
#include <Rcpp.h>
using namespace Rcpp ;
// [[Rcpp::export]]
double absolute(double x) {
  // Magnitude of x: negate only when x is non-positive.  A hand-rolled
  // fabs; -0.0 compares equal to 0.0, so zero input yields zero.
  return x > 0 ? x : -x;
}
/*** R
absolute(pi)
absolute(-3)
*/
#include <Rcpp.h>
using namespace Rcpp;
// [[Rcpp::export]]
double sqrt_approx(double value, int n) {
  // Newton-Raphson (Heron's method) for sqrt(value): run exactly n
  // refinement steps from an initial guess of 1.
  double estimate = 1;
  for (int step = 0; step < n; ++step) {
    estimate = (estimate + value / estimate) / 2.0;
  }
  return estimate;
}
/*** R
sqrt_approx(2, 10)
*/
#include <Rcpp.h>
using namespace Rcpp;
// [[Rcpp::export]]
List sqrt_approx(double value, int n, double threshold) {
double x = 1.0;
double previous = x;
bool is_good_enough = false;
int i;
for(i = 0; i < n; i++) {
previous = x;
x = (x + value / x) / 2.0;
is_good_enough = fabs(previous - x) < threshold;
// If the solution is good enough, then "break"
if(is_good_enough) break;
}
return List::create(_["i"] = i , _["x"] = x);
}
/*** R
sqrt_approx(2, 1000, 0.1)
*/
#include <Rcpp.h>
using namespace Rcpp;
// [[Rcpp::export]]
double sqrt_approx(double value, double threshold) {
  // Newton-Raphson for sqrt(value): refine an initial guess of 1 until
  // two consecutive estimates agree to within threshold.
  double estimate = 1.0;
  for (;;) {
    double prev = estimate;
    estimate = (prev + value / prev) / 2.0;
    if (fabs(estimate - prev) < threshold) {
      return estimate;
    }
  }
}
/*** R
sqrt_approx(2, 0.00001)
*/
#include <Rcpp.h>
using namespace Rcpp;
// Approximate sqrt(value) by Newton-Raphson iteration, starting from 1
// and stopping once successive estimates differ by less than threshold.
// Unlike the while-loop form, a do-while always runs at least once.
// [[Rcpp::export]]
double sqrt_approx(double value, double threshold) {
double x = 1.0;
double previous = x;
bool is_good_enough = false;
// Initiate do while loop: the body executes before the test
do {
// Keep the previous estimate to measure convergence
previous = x;
// Newton-Raphson update for f(x) = x^2 - value
x = (x + value / x) / 2.0;
is_good_enough = fabs(x - previous) < threshold;
// Specify while condition: repeat until converged
} while (is_good_enough == false);
return x;
}
/*** R
sqrt_approx(2, 0.00001)
*/
Chapter 3 - Vector Classes
Rcpp Classes and Vectors:
// manipulate x[i] Creating Vectors:
// create a new numeric vector of size n NumericVector x(n) ; // manipulate it for( int i=0; i<n; i++){ x[i] = 1 ; } return x ; // clone x into y NumericVector y = clone(x) ; for( int i=0; i< y.size(); i++){ if( y[i] < 0 ) y[i] = 0 ; } return y ; Weighted Mean:
total_xw <- 0 total_w <- 0 for( i in seq_along(x)){ total_xw <- total_xw + x[i]*w[i] total_w <- total_w + w[i] } total_xw / total_w double total_xw = 0.0 ; double total_w = 0.0 ; int n = ___ ; for( ___ ; ___ ; ___ ){ // accumulate into total_xw and total_w } return total_xw / total_w ; Vectors From the STL:
Example code includes:
#include <Rcpp.h>
using namespace Rcpp;
// Sum of the first and last elements of x.  Assumes x is non-empty;
// x[0] and x[n-1] are used without a length check -- TODO confirm
// callers never pass a zero-length vector.
// [[Rcpp::export]]
double first_plus_last(NumericVector x) {
// The size of x
int n = x.size();
// The first element of x
double first = x[0];
// The last element of x
double last = x[n-1];
return first + last;
}
/*** R
x <- c(6, 28, 496, 8128)
first_plus_last(x)
# Does the function give the same answer as R?
all.equal(first_plus_last(x), x[1] + x[4])
*/
#include <Rcpp.h>
using namespace Rcpp;
// Sum the elements of x with an explicit loop (the C++ analogue of
// R's sum(); NA values are not treated specially here).
// [[Rcpp::export]]
double sum_cpp(NumericVector x) {
// The size of x
int n = x.size();
// Initialize the result
double result = 0;
// Accumulate left to right over all n elements
for(int i = 0; i<n; i++) {
// Add the next value
result = result + x[i];
}
return result;
}
/*** R
set.seed(42)
x <- rnorm(1e6)
sum_cpp(x)
# Does the function give the same answer as R's sum() function?
all.equal(sum_cpp(x), sum(x))
*/
#include <Rcpp.h>
using namespace Rcpp;
// Integer sequence from lo to hi, inclusive (like R's lo:hi for
// lo <= hi).  Assumes lo <= hi; otherwise n is non-positive -- TODO
// confirm callers guarantee lo <= hi.
// Set the return type to IntegerVector
// [[Rcpp::export]]
IntegerVector seq_cpp(int lo, int hi) {
int n = hi - lo + 1;
// Create a new integer vector, sequence, of size n
IntegerVector sequence(n);
for(int i = 0; i < n; i++) {
// Set the ith element of sequence to lo plus i
sequence[i] = lo + i;
}
return sequence;
}
/*** R
lo <- -2
hi <- 5
seq_cpp(lo, hi)
# Does it give the same answer as R's seq() function?
all.equal(seq_cpp(lo, hi), seq(lo, hi))
*/
#include <Rcpp.h>
using namespace Rcpp;
// [[Rcpp::export]]
List create_vectors() {
// Create an unnamed character vector
CharacterVector polygons = CharacterVector::create("triangle", "square", "pentagon");
// Create a named integer vector
IntegerVector mersenne_primes = IntegerVector::create(_["first"] = 3, _["second"] = 7, _["third"] = 31);
// Create a named list
List both = List::create(_["polygons"] = polygons, _["mersenne_primes"] = mersenne_primes);
return both;
}
/*** R
create_vectors()
*/
# Unlike R's copy-on-modify semantics, Rcpp vectors are copied by reference: assigning one vector to another makes both point at the same underlying data, so modifying the "copy" also changes the original.
# To prevent this behavior, you have to use the clone() function to copy the underlying data from the original variable into the new variable
# The syntax is y = clone(x). In this exercise, we have defined two functions for you:
# change_negatives_to_zero(): Takes a numeric vector, modifies by replacing negative numbers with zero, then returns both the original vector and the copy.
# change_negatives_to_zero_with_cloning(): Does the same thing as above, but clones the original vector before modifying it.
#include <Rcpp.h>
using namespace Rcpp;
// [[Rcpp::export]]
List change_negatives_to_zero(NumericVector the_original) {
// Set the copy to the original
NumericVector the_copy = the_original;
int n = the_original.size();
for(int i = 0; i < n; i++) {
if(the_copy[i] < 0) the_copy[i] = 0;
}
return List::create(_["the_original"] = the_original, _["the_copy"] = the_copy);
}
// [[Rcpp::export]]
List change_negatives_to_zero_with_cloning(NumericVector the_original) {
// Clone the original to make the copy
NumericVector the_copy = clone(the_original);
int n = the_original.size();
for(int i = 0; i < n; i++) {
if(the_copy[i] < 0) the_copy[i] = 0;
}
return List::create(_["the_original"] = the_original, _["the_copy"] = the_copy);
}
/*** R
x <- c(0, -4, 1, -2, 2, 4, -3, -1, 3)
change_negatives_to_zero_with_cloning(x)
change_negatives_to_zero(x)
*/
#include <Rcpp.h>
using namespace Rcpp;
// Weighted mean of x with weights w: sum(x * w) / sum(w).
// Missing values are not handled in this version.
// Assumes w has at least as many elements as x.
// [[Rcpp::export]]
double weighted_mean_cpp(NumericVector x, NumericVector w) {
// Initialize these to zero
double total_w = 0.0;
double total_xw = 0.0;
// Set n to the size of x
int n = x.size();
// Accumulate both running totals in a single pass
for(int i = 0; i<n; i++) {
// Add ith weight
total_w += w[i];
// Add the ith data value times the ith weight
total_xw += w[i]*x[i];
}
// Return the total product divided by the total weight
return total_xw / total_w;
}
/*** R
x <- c(0, 1, 3, 6, 2, 7, 13, 20, 12, 21, 11)
w <- 1 / seq_along(x)
weighted_mean_cpp(x, w)
# Does the function give the same results as R's weighted.mean() function?
all.equal(weighted_mean_cpp(x, w), weighted.mean(x, w))
*/
#include <Rcpp.h>
using namespace Rcpp;
// Weighted mean of x with weights w, propagating missing values:
// if any x[i] or w[i] is NA the result is NA (mirrors R's behavior
// without na.rm).  Assumes w has at least as many elements as x.
// [[Rcpp::export]]
double weighted_mean_cpp(NumericVector x, NumericVector w) {
  double total_w = 0;
  double total_xw = 0;
  int n = x.size();
  for(int i = 0; i < n; i++) {
    // Short-circuit || (the original used bitwise |) so the second
    // is_na test is skipped once the first already found an NA
    if (NumericVector::is_na(x[i]) || NumericVector::is_na(w[i])) {
      return NumericVector::get_na();
    }
    total_w += w[i];
    total_xw += x[i] * w[i];
  }
  return total_xw / total_w;
}
/*** R
x <- c(0, 1, 3, 6, 2, 7, 13, NA, 12, 21, 11)
w <- 1 / seq_along(x)
weighted_mean_cpp(x, w)
*/
NumericVector bad_select_positive_values_cpp(NumericVector x) {
NumericVector positive_x(0);
for(int i = 0; i < x.size(); i++) {
if(x[i] > 0) {
positive_x.push_back(x[i]);
}
}
return positive_x;
}
#include <Rcpp.h>
using namespace Rcpp;
// Return the positive elements of x, preserving order.  Two passes:
// first count the positives, then copy into a vector allocated once at
// exactly the right size (avoids growing a vector element by element).
// [[Rcpp::export]]
NumericVector good_select_positive_values_cpp(NumericVector x) {
int n_elements = x.size();
int n_positive_elements = 0;
// Pass 1: calculate the size of the output
for(int i = 0; i < n_elements; i++) {
// If the ith element of x is positive
if(x[i] > 0) {
// Add 1 to n_positive_elements
n_positive_elements++;
}
}
// Allocate a vector of size n_positive_elements
NumericVector positive_x(n_positive_elements);
// Pass 2: fill the vector, tracking the output index j separately
int j = 0;
for(int i = 0; i < n_elements; i++) {
// If the ith element of x is positive
if(x[i] > 0) {
// Set the jth element of positive_x to the ith element of x
positive_x[j] = x[i];
// Add 1 to j
j++;
}
}
return positive_x;
}
/*** R
set.seed(42)
x <- rnorm(1e4)
# Does it give the same answer as R?
all.equal(good_select_positive_values_cpp(x), x[x > 0])
# Which is faster?
microbenchmark(
bad_cpp = bad_select_positive_values_cpp(x),
good_cpp = good_select_positive_values_cpp(x)
)
*/
# The standard template library (stl) is a C++ library containing flexible algorithms and data structures
# For example, the double vector from the stl is like a "native C++" equivalent of Rcpp's NumericVector
# The following code creates a standard double vector named x with ten elements
std::vector<double> x(10);
# Usually it makes more sense to stick with Rcpp vector types because it gives you access to many convenience methods that work like their R equivalents, including mean(), round(), and abs()
# However, the stl vectors have an advantage that they can dynamically change size without paying for data copy every time
#include <Rcpp.h>
using namespace Rcpp;
// Set the return type to a standard double vector
// [[Rcpp::export]]
std::vector<double> select_positive_values_std(NumericVector x) {
int n = x.size();
// Create positive_x, a standard double vector
std::vector<double> positive_x(0);
for(int i = 0; i < n; i++) {
if(x[i] > 0) {
// Append the ith element of x to positive_x
positive_x.push_back(x[i]);
}
}
return positive_x;
}
/*** R
set.seed(42)
x <- rnorm(1e6)
# Does it give the same answer as R?
all.equal(select_positive_values_std(x), x[x > 0])
# Which is faster?
microbenchmark(
good_cpp = good_select_positive_values_cpp(x),
std = select_positive_values_std(x)
)
*/
Chapter 4 - Case Studies
Random Number Generation:
// keep generating d until it gets positive double d ; do { d = ... ; } while( d < 0 ) ; NumericVector res(n) ; for( int i=0; i<n; i++){ // find which component to use ... // simulate using the mean and sd from the selected component ... } return res ; Rolling Operations:
total <- 0 n <- 0 for( i in seq_along(x) ){ if( is.na(x[i]) ){ x[i] <- total / n } else { total <- x[i] + total n <- n + 1 } } Auto-Regressive Model:
x <- epsilon <- rnorm(n, sd = sd) np <- length(phi) for( i in seq(np+1, n)){ x[i] <- sum(x[seq(i-1, i-np)] * phi) + epsilon[i] } x x[i] = R::rnorm(___) ; double value = rnorm(___) ; // inner loop for( ___ ; ___ ; ___ ){ value += ___ ; } x[i] = value ; epsilon <- rnorm(n, sd = sd) x <- numeric(n) nq <- length(theta) for( i in seq(nq+1, n)){ x[i] <- sum(epsilon[seq(i-1, i-nq)] * theta) + epsilon[i] } x int nq = theta.size() ; // generate the noise vector at once // using the Rcpp::rnorm function, similar to the R function NumericVector eps = Rcpp::rnorm(n, 0.0, sd) ; // init the output vector of size n with all 0.0 NumericVector x(___) ; // start filling the values at index nq + 1 for( int i=nq+1; i<n; i++){ ____ } return x ; Wrap Up:
return x + y ; return 2.0 * x; Example code includes:
# When you write R code, it usually makes sense to generate random numbers in a vectorized fashion
# When you are in C++ however, you are allowed (even by your guilty conscience) to use loops and process the data element by element
#include <Rcpp.h>
using namespace Rcpp;
// Draw n values from Normal(mean, sd), rejecting negative draws: each
// element is re-sampled until a non-negative value comes up.
// [[Rcpp::export]]
NumericVector positive_rnorm(int n, double mean, double sd) {
// Specify out as a numeric vector of size n
NumericVector out(n);
// This loops over the elements of out
for(int i = 0; i < n; i++) {
// This loop keeps trying to generate a value
do {
// Call R's rnorm() for a single scalar draw
out[i] = R::rnorm(mean, sd);
// While the number is negative, keep trying
// (can iterate many times when mean is far below zero)
} while(out[i] < 0);
}
return out;
}
/*** R
positive_rnorm(10, 2, 2)
*/
#include <Rcpp.h>
using namespace Rcpp;
// Sample a component index in [0, weights.size()) with probability
// proportional to each weight: draw u ~ Uniform(0, total_weight) and
// walk the weights, subtracting, until u falls inside component j.
// NOTE(review): if total_weight exceeds sum(weights) (or floating-point
// error leaves x >= the last weight), j can walk past the end of
// weights -- confirm callers pass total_weight == sum(weights).
// [[Rcpp::export]]
int choose_component(NumericVector weights, double total_weight) {
// Generate a uniform random number from 0 to total_weight
double x = R::runif(0, total_weight);
// Remove the jth weight from x until x is small enough
int j = 0;
while(x >= weights[j]) {
// Subtract jth element of weights from x
x -= weights[j];
j++;
}
return j;
}
/*** R
weights <- c(0.3, 0.7)
# Randomly choose a component 5 times
replicate(5, choose_component(weights, sum(weights)))
*/
#include <Rcpp.h>
using namespace Rcpp;
// From previous exercise; do not modify
// [[Rcpp::export]]
int choose_component(NumericVector weights, double total_weight) {
double x = R::runif(0, total_weight);
int j = 0;
while(x >= weights[j]) {
x -= weights[j];
j++;
}
return j;
}
// [[Rcpp::export]]
NumericVector rmix(int n, NumericVector weights, NumericVector means, NumericVector sds) {
// Check that weights and means have the same size
int d = weights.size();
if(means.size() != d) {
stop("means size != weights size");
}
// Do the same for the weights and std devs
if(sds.size() != d) {
stop("sds size != weights size");
}
// Calculate the total weight
double total_weight = 0.0;
for (int i=0; i<d; i++) {
total_weight += weights[i];
};
// Create the output vector
NumericVector res(n);
// Fill the vector
for(int i = 0; i < n; i++) {
// Choose a component
int j = choose_component(weights, total_weight);
// Simulate from the chosen component
res[i] = R::rnorm(means[j], sds[j]);
}
return res;
}
/*** R
weights <- c(0.3, 0.7)
means <- c(2, 4)
sds <- c(2, 4)
rmix(10, weights, means, sds)
*/
# Vectorized rolling mean with width `window`.  The first window - 1
# entries are NA; entry i (i >= window) is mean(x[(i - window + 1):i]).
rollmean3 <- function(x, window = 3) {
  # Total of the first full window
  first_total <- sum(head(x, window))
  # Values entering / leaving the window at each later step
  incoming <- tail(x, -window)
  outgoing <- head(x, -window)
  # Each later total is the first total plus the cumulative net change
  later_totals <- first_total + cumsum(incoming - outgoing)
  # Leading NAs, then all the window means
  c(rep(NA, window - 1), c(first_total, later_totals) / window)
}
# From previous step; do not modify
# Vectorized rolling mean of width `window`: the first window - 1
# entries are NA, entry i (i >= window) is mean(x[(i-window+1):i]).
rollmean3 <- function(x, window = 3) {
# Sum of the first full window
initial_total <- sum(head(x, window))
# Values entering the window after the first position
lasts <- tail(x, - window)
# Values leaving the window after the first position
firsts <- head(x, - window)
# Later window totals: initial total plus cumulative net change
other_totals <- initial_total + cumsum(lasts - firsts)
c(rep(NA, window - 1), initial_total / window, other_totals / window)
}
# This checks rollmean1() and rollmean2() give the same result
all.equal(rollmean1(x), rollmean2(x))
# This checks rollmean1() and rollmean3() give the same result
all.equal(rollmean1(x), rollmean3(x))
# Benchmark the performance
microbenchmark(rollmean1(x), rollmean2(x), rollmean3(x), times = 5)
#include <Rcpp.h>
using namespace Rcpp;
// Rolling mean of x with the given window width, computed in a single
// pass by updating a running total (drop the value leaving the window,
// add the value entering it).  Positions before the first full window
// remain NA.
// [[Rcpp::export]]
NumericVector rollmean4(NumericVector x, int window) {
int n = x.size();
// Set res as a NumericVector of NAs with length n
NumericVector res(n, NumericVector::get_na());
// Sum the first window worth of values of x
double total = 0.0;
for(int i = 0; i < window; i++) {
total += x[i];
}
// Treat the first case separately
res[window - 1] = total / window;
// Iteratively update the total and recalculate the mean
for(int i = window; i < n; i++) {
// Remove the (i - window)th case, and add the ith case
total += - x[i-window] + x[i];
// Calculate the mean at the ith position
res[i] = total / window;
}
return res;
}
/*** R
# Compare rollmean2, rollmean3 and rollmean4
set.seed(42)
x <- rnorm(1e4)
microbenchmark(rollmean2(x, 4), rollmean3(x, 4), rollmean4(x, 4), times = 5)
*/
#include <Rcpp.h>
using namespace Rcpp;
// Last observation carried forward: replace each NA in x with the most
// recent non-NA value before it.  Leading NAs (before any observation)
// stay NA because `current` starts as NA.
// [[Rcpp::export]]
NumericVector na_locf2(NumericVector x) {
  // The value carried forward; NA until the first observation is seen
  double current = NumericVector::get_na();
  int n = x.size();
  // no_init skips zero-filling; every element is written in the loop
  NumericVector res = no_init(n);
  for(int i = 0; i < n; i++) {
    if(NumericVector::is_na(x[i])) {
      // Missing: fill with the last observed value (fix: the original
      // transcription dropped the statement-terminating semicolons on
      // both assignments, so the chunk would not compile)
      res[i] = current;
    } else {
      // Observed: record it, then copy it through
      current = x[i];
      res[i] = x[i];
    }
  }
  return res;
}
/*** R
library(microbenchmark)
set.seed(42)
x <- rnorm(1e5)
# Sprinkle some NA into x
x[sample(1e5, 100)] <- NA
microbenchmark(na_locf1(x), na_locf2(x), times = 5)
*/
#include <Rcpp.h>
using namespace Rcpp;
// Replace each NA in x with the mean of all non-missing values seen so
// far.  A leading NA (before any observation) becomes 0.0 / 0.0 = NaN.
// [[Rcpp::export]]
NumericVector na_meancf2(NumericVector x) {
// Running total and count of the non-missing values seen so far
double total_not_na = 0.0;
double n_not_na = 0.0;
// Clone so the input vector is not modified in place
NumericVector res = clone(x);
int n = x.size();
for(int i = 0; i < n; i++) {
// If ith value of x is NA
if(NumericVector::is_na(x[i])) {
// Set the ith result to the total of non-missing values
// divided by the number of non-missing values
res[i] = total_not_na / n_not_na;
} else {
// Add the ith value of x to the total of non-missing values
total_not_na += x[i];
// Add 1 to the number of non-missing values
n_not_na ++;
}
}
return res;
}
/*** R
library(microbenchmark)
set.seed(42)
x <- rnorm(1e5)
x[sample(1e5, 100)] <- NA
microbenchmark(na_meancf1(x), na_meancf2(x), times = 5)
*/
#include <Rcpp.h>
using namespace Rcpp;
// Simulate an AR(p) series of length n:
//   x[i] = eps_i + sum_j phi[j] * x[i - j - 1],  eps_i ~ Normal(c, eps)
// The first p elements keep their initialized value of 0.
// n   - series length
// c   - mean of each innovation
// phi - AR coefficients; order p = phi.size()
// eps - standard deviation of each innovation
// [[Rcpp::export]]
NumericVector ar2(int n, double c, NumericVector phi, double eps) {
int p = phi.size();
NumericVector x(n);
// Loop from p to n (earlier positions lack a full history)
for(int i = p; i < n; i++) {
// Generate a random number from the normal distribution
double value = R::rnorm(c, eps);
// Loop from zero to p, adding the autoregressive terms
for(int j = 0; j < p; j++) {
// Increase by the jth element of phi times
// the "i minus j minus 1"th element of x
value += phi[j] * x[i-j-1];
}
x[i] = value;
}
return x;
}
/*** R
d <- data.frame(x = 1:50, y = ar2(50, 10, c(1, -0.5), 1))
ggplot(d, aes(x, y)) +
geom_line()
*/
#include <Rcpp.h>
using namespace Rcpp ;
// [[Rcpp::export]]
NumericVector ma2( int n, double mu, NumericVector theta, double sd ){
int q = theta.size();
NumericVector x(n);
// Generate the noise vector
NumericVector eps = rnorm(n, 0.0, sd);
// Loop from q to n
for(int i = q; i < n; i++) {
// Value is mean plus noise
double value = mu + eps[i];
// Loop from zero to q
for(int j = 0; j < q; j++) {
// Increase by the jth element of theta times
// the "i minus j minus 1"th element of eps
value += theta[j] * eps[i - j - 1];
}
// Set ith element of x to value
x[i] = value;
}
return x ;
}
/*** R
d <- data.frame(x = 1:50, y = ma2(50, 10, c(1, -0.5), 1))
ggplot(d, aes(x, y)) +
geom_line()
*/
#include <Rcpp.h>
using namespace Rcpp;
// [[Rcpp::export]]
NumericVector arma(int n, double mu, NumericVector phi, NumericVector theta, double sd) {
int p = phi.size();
int q = theta.size();
NumericVector x(n);
// Generate the noise vector
NumericVector eps = rnorm(n, 0.0, sd);
// Start at the max of p and q plus 1
int start = std::max(p, q) + 1;
// Loop i from start to n
for(int i = start; i < n; i++) {
// Value is mean plus noise
double value = mu + eps[i];
// The MA(q) part
for(int j = 0; j < q; j++) {
// Increase by the jth element of theta times
// the "i minus j minus 1"th element of eps
value += theta[j] * eps[i - j - 1];
}
// The AR(p) part
for(int j = 0; j < p; j++) {
// Increase by the jth element of phi times
// the "i minus j minus 1"th element of x
value += phi[j] * x[i - j - 1];
}
x[i] = value;
}
return x;
}
/*** R
d <- data.frame(x = 1:50, y = arma(50, 10, c(1, -0.5), c(1, -0.5), 1))
ggplot(d, aes(x, y)) +
geom_line()
*/
Chapter 1 - GLMs
Before Starting:
Introduction to Generalized Linear Models (GLM):
Poisson GLM:
Example code includes:
dragonflies <- readr::read_csv("./RInputFiles/data1.csv")
## Parsed with column specification:
## cols(
## abundance = col_double(),
## feeding_events = col_double(),
## area = col_double(),
## stream_flow = col_double(),
## time = col_character(),
## season = col_character()
## )
orchids <- readr::read_csv("./RInputFiles/lme_data.csv")
## Parsed with column specification:
## cols(
## site = col_character(),
## abundance = col_double(),
## richness = col_double(),
## humidity = col_double(),
## tree_age = col_double()
## )
str(dragonflies)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 150 obs. of 6 variables:
## $ abundance : num 16 32 88 140 62 143 121 69 100 68 ...
## $ feeding_events: num 69 153 408 691 355 678 617 334 534 362 ...
## $ area : num 3.67 4.57 5.1 3.19 3.83 ...
## $ stream_flow : num 1.288 1.279 0.596 1.5 1.165 ...
## $ time : chr "day" "night" "day" "day" ...
## $ season : chr "summer" "autumn" "summer" "summer" ...
## - attr(*, "spec")=
## .. cols(
## .. abundance = col_double(),
## .. feeding_events = col_double(),
## .. area = col_double(),
## .. stream_flow = col_double(),
## .. time = col_character(),
## .. season = col_character()
## .. )
str(orchids)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 160 obs. of 5 variables:
## $ site : chr "a" "a" "a" "a" ...
## $ abundance: num 11 10 13 11 10 10 9 12 11 10 ...
## $ richness : num 7 4 4 4 4 3 4 3 3 3 ...
## $ humidity : num 59.5 70.4 73.4 53.8 66.8 57.9 78.7 81 60.1 73.3 ...
## $ tree_age : num 14 12 9 14 9 11 13 15 7 19 ...
## - attr(*, "spec")=
## .. cols(
## .. site = col_character(),
## .. abundance = col_double(),
## .. richness = col_double(),
## .. humidity = col_double(),
## .. tree_age = col_double()
## .. )
# Draw histogram
ggplot(dragonflies) +
geom_histogram(aes(x = feeding_events))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Draw scatterplot
ggplot(dragonflies) +
geom_point(aes(x = stream_flow, y = feeding_events))
# Apply a GLM
gaussian_glm <- glm(feeding_events ~ stream_flow, data = dragonflies, family = "gaussian")
# Set up a data frame for predictions
pred_df <- data.frame(stream_flow = seq(from = 1, to = 5, length = 10))
# Generate predictions
pred_df$predicted <- predict(gaussian_glm, pred_df)
# Look at the data frame
pred_df
## stream_flow predicted
## 1 1.00 321.08
## 2 1.44 285.11
## 3 1.89 249.13
## 4 2.33 213.15
## 5 2.78 177.17
## 6 3.22 141.20
## 7 3.67 105.22
## 8 4.11 69.24
## 9 4.56 33.27
## 10 5.00 -2.71
# Add model line to plot
ggplot(dragonflies) +
geom_point(aes(x = stream_flow, y = feeding_events)) +
geom_line(aes(x = stream_flow, y = predicted), data = pred_df)
# Generate data frame of residuals and fitted values
diag <- data.frame(residuals = resid(gaussian_glm), fitted = fitted(gaussian_glm))
# Visualize residuals vs fitted values
ggplot(diag) +
geom_point(aes(x = fitted, y = residuals))
# Apply Poisson GLM
poisson_glm <- glm(feeding_events ~ stream_flow, data = dragonflies, family = "poisson")
# Set up a data frame for predictions
pred_df <- data.frame(stream_flow = seq(from = 1, to = 5, length = 10))
# Generate predictions
pred_df$predicted <- predict(poisson_glm, pred_df, type = "response")
# Add line representing the Poisson GLM
ggplot(dragonflies) +
geom_point(aes(x = stream_flow, y = feeding_events)) +
geom_line(aes(x = stream_flow, y = predicted), data = pred_df)
# Dispersion statistic for a count GLM: the sum of squared Pearson
# residuals divided by the residual degrees of freedom.  Values much
# greater than 1 indicate overdispersion.
#
# model     - a fitted glm / glm.nb object
# modeltype - "p"/"poisson"/"qp"/"quasipoisson", or "nb"/"negativebinomial";
#             the negative binomial loses one extra df for theta
dispersion <- function(model, modeltype = "p") {
  A <- sum(resid(model, type = "pearson") ^ 2)
  # Fix: the default "p" (and "qp") previously matched neither branch,
  # leaving B undefined and triggering "object 'B' not found"
  if (modeltype %in% c("p", "poisson", "qp", "quasipoisson")) {
    B <- length(resid(model)) - length(coef(model))
  } else if (modeltype %in% c("nb", "negativebinomial")) {
    # The +1 accounts for the estimated theta parameter
    B <- length(resid(model)) - (length(coef(model)) + 1)
  } else {
    stop("unknown modeltype: ", modeltype, call. = FALSE)
  }
  A / B
}
# Generate data frame of residuals and fitted values
diag <- data.frame(fitted=fitted(poisson_glm), residuals=resid(poisson_glm))
# Visualize residuals vs fitted values
ggplot(diag) +
geom_point(aes(x=fitted, y=residuals))
# Calculate the dispersion of the model
dispersion(poisson_glm, modeltype="poisson")
## [1] 97.6
Chapter 2 - Extending GLMs
Adding Factors and Interactions:
Adding an Offset to the Model:
Negative Binomial Model and Model Selection:
Model Selection and Visualization:
Example code includes:
# Plot Pearson residuals of a fitted model against a covariate or
# factor.  Continuous covariates get a scatterplot, factors get
# boxplots, each with a dashed red zero line.
#
# model      - fitted model object (lm / glm / glm.nb)
# plotfactor - covariate or factor from the original data; rows dropped
#              by na.action during fitting are removed here so lengths
#              match the residual vector
# xlabel     - x-axis label
# modeltype  - model family, used to locate the dropped rows
pr_fac <- function(model, plotfactor, xlabel = "", modeltype = "lm"){
if(modeltype %in% c("linear", "lm", "poisson", "p", "quasipossion", "qp")){
# Rows removed by na.action during fitting
skiprows <- unique(summary(model)$na.action)
}
if(modeltype %in% c("negativebinomial", "nb")){
# NOTE(review): magic index -- presumably summary(model)[21] is the
# na.action component of a glm.nb summary; confirm before reuse
skiprows <- unique(summary(model)[21])
}
if(length(skiprows) > 0) {Factor <- plotfactor[-skiprows]}
if(length(skiprows) == 0) {Factor <- plotfactor}
# Pair each Pearson residual with its covariate value
plot1 <- data.frame(PR = resid(model, type = "pearson"),Factor)
if(is.factor(Factor) == FALSE){
PR.plot1 <- ggplot(plot1) +
geom_point(aes(y = PR, x = Factor)) +
geom_hline(yintercept = 0, linetype = 'dashed', col = 'red')+
ylab("Residuals") + xlab(xlabel) + theme_bw(18)
}
if(is.factor(Factor) == TRUE){
PR.plot1 <- ggplot(plot1) +
geom_boxplot(aes(y = PR, x = Factor)) +
geom_hline(yintercept = 0, linetype = 'dashed', col = 'red') +
ylab("Residuals") + xlab(xlabel) +
theme_bw(18)
}
return(PR.plot1)
}
# Compare residuals across factor levels
pr_fac(poisson_glm, dragonflies$time, xlabel = "time", modeltype = "poisson")
# Add time as a factor, including an interaction
poisson_glm_factor <- glm(feeding_events ~ stream_flow * time, data = dragonflies, family = "poisson")
# Generate predicted values of feeding_events
pred_df <- expand.grid(stream_flow = seq(from = 1, to = 5, length = 10), time = c("day", "night"))
pred_df$predicted <- predict(poisson_glm_factor, pred_df, type = "response")
# Visualize predicted values of feeding events
ggplot(dragonflies) +
geom_point(aes(x = stream_flow, y = feeding_events)) +
geom_line(aes(x = stream_flow, y = predicted, col = time), data = pred_df)
# Generate data frame of residuals and fitted values
diag <- data.frame(residuals=resid(poisson_glm_factor), fitted=fitted(poisson_glm_factor))
# Visualize residuals vs fitted values
ggplot(diag) +
geom_point(aes(x=fitted, y=residuals))
# Calculate the dispersion of the model
dispersion(poisson_glm_factor, modeltype="poisson")
## [1] 86.8
# Create a column containing the natural log of area
dragonflies$logarea <- log(dragonflies$area)
# Apply Poisson GLM with interaction and offset
poisson_glm_offset <- glm(feeding_events ~ stream_flow * time + offset(logarea),
data = dragonflies, family = "poisson"
)
# Apply Negative Binomial GLM
neg_binom_glm <- MASS::glm.nb(feeding_events ~ stream_flow*time + offset(logarea), data=dragonflies)
# Use drop1 to determine which term(s) can be dropped
drop1(neg_binom_glm, test="Chisq")
## Single term deletions
##
## Model:
## feeding_events ~ stream_flow * time + offset(logarea)
## Df Deviance AIC LRT Pr(>Chi)
## <none> 163 1829
## stream_flow:time 1 164 1828 1.24 0.27
# Apply a new Negative Binomial GLM
neg_binom_glm_small <- MASS::glm.nb(feeding_events ~ stream_flow + time + offset(logarea), data=dragonflies)
# Calculate dispersion for each model
dispersion(neg_binom_glm, modeltype="nb")
## [1] 1.08
dispersion(neg_binom_glm_small, modeltype="nb")
## [1] 1.06
# Generate data frame of residuals and fitted values for neg_binom_glm
diag <- data.frame(fitted=fitted(neg_binom_glm), residuals=resid(neg_binom_glm))
# Visualize residuals vs fitted values for neg_binom_glm
ggplot(diag) +
geom_point(aes(x=fitted, y=residuals))
# Generate data frame of residuals and fitted values for neg_binom_glm
diag_small <- data.frame(residuals = resid(neg_binom_glm_small), fitted = fitted(neg_binom_glm_small))
# Visualize residuals vs fitted values for neg_binom_glm
ggplot(diag_small) +
geom_point(aes(x = fitted, y = residuals))
# Compare AIC scores
AIC(neg_binom_glm, neg_binom_glm_small)
## df AIC
## neg_binom_glm 5 1831
## neg_binom_glm_small 4 1830
# View the selected model
neg_binom_glm_small
##
## Call: MASS::glm.nb(formula = feeding_events ~ stream_flow + time +
## offset(logarea), data = dragonflies, init.theta = 1.837975779,
## link = log)
##
## Coefficients:
## (Intercept) stream_flow timenight
## 5.616 -0.871 -0.417
##
## Degrees of Freedom: 149 Total (i.e. Null); 147 Residual
## Null Deviance: 386
## Residual Deviance: 163 AIC: 1830
# Create data frame
pred_df <- expand.grid(stream_flow=seq(1, 5, length=5), time=c("day", "night"), logarea=log(6))
# Generate predicted values
pred_df$predicted <- predict(neg_binom_glm_small, newdata=pred_df, type="response")
# Visualize predicted values
ggplot(dragonflies) +
geom_point(aes(x = stream_flow, y = feeding_events)) +
geom_line(aes(x=stream_flow, y=predicted, color=time), data=pred_df)
# Extract fitted values
raw_fit <- predict(neg_binom_glm_small, pred_df, type = "link")
# Extract standard errors
raw_se <- predict(neg_binom_glm_small, pred_df, type = "link", se = TRUE)$se
# Generate predictions of upper and lower values
pred_df$upper <- exp(raw_fit + 1.96 * raw_se)
pred_df$lower <- exp(raw_fit - 1.96 * raw_se)
# Visualize the standard errors around the predicted values
ggplot(dragonflies) +
geom_point(aes(x = stream_flow, y = feeding_events)) +
geom_line(aes(x = stream_flow, y = predicted, col = time), data = pred_df) +
geom_line(aes(x = stream_flow, y = lower, col = time), linetype="dashed", data = pred_df) +
geom_line(aes(x = stream_flow, y = upper, col = time), linetype="dashed", data = pred_df)
Chapter 3 - Mixed Effects Model I
Mixed Effects Models:
Model Selection and Interpretation:
Visualizing a Random Intercept Model:
Example code includes:
# Create scatterplot of humidity and abundance, coloured by site
ggplot(orchids, aes(x=humidity, y=abundance, color=site)) +
geom_point()
# Apply GLM (gaussian family, i.e. ordinary linear regression with site as a fixed effect)
linear_glm <- glm(abundance ~ humidity + site, data = orchids, family = "gaussian")
# Look at the output to see parameters for each site
coef(linear_glm)
## (Intercept) humidity siteb sitec sited sitee
## -0.702 0.177 2.940 -6.582 -6.492 -1.579
## sitef siteg siteh
## -5.607 -6.000 -5.336
# Apply random intercept model: random = ~1|site gives each site its own
# intercept drawn from a shared normal distribution
random_int_model <- nlme::lme(abundance ~ humidity, random = ~1|site, data=orchids)
# Look at model output
random_int_model
## Linear mixed-effects model fit by REML
## Data: orchids
## Log-restricted-likelihood: -420
## Fixed: abundance ~ humidity
## (Intercept) humidity
## -4.375 0.179
##
## Random effects:
## Formula: ~1 | site
## (Intercept) Residual
## StdDev: 3.52 3.07
##
## Number of Observations: 160
## Number of Groups: 8
# Fit linear model using Generalized Least Squares (no random effects;
# this is the simpler "null" model for the likelihood ratio test below)
gls_model <- nlme::gls(abundance ~ humidity, data = orchids)
# Apply a random intercept model (REML stated explicitly here)
random_int_model <- nlme::lme(abundance ~ humidity, random = ~1 | site, data = orchids, method = "REML")
# Apply likelihood ratio test to compare models
anova(gls_model, random_int_model)
## Model df AIC BIC logLik Test L.Ratio p-value
## gls_model 1 3 944 953 -469
## random_int_model 2 4 849 861 -420 1 vs 2 97.5 <.0001
# Print the model that fits better (lower AIC, higher logLik)
random_int_model
## Linear mixed-effects model fit by REML
## Data: orchids
## Log-restricted-likelihood: -420
## Fixed: abundance ~ humidity
## (Intercept) humidity
## -4.375 0.179
##
## Random effects:
## Formula: ~1 | site
## (Intercept) Residual
## StdDev: 3.52 3.07
##
## Number of Observations: 160
## Number of Groups: 8
# Calculate estimate of variance for the random intercept
# (variance = stddev squared; use `^`, the idiomatic R power operator, not `**`)
calculated_value <- 3.515514^2
calculated_value
## [1] 12.4
# Extract estimate of variance for the random intercept.
# NOTE: nlme::VarCorr() returns a character matrix, so the extracted value
# is the string "12.4", not a numeric -- see the quoted output below.
extracted_value <- nlme::VarCorr(random_int_model)[1, 1]
extracted_value
## [1] "12.4"
# Create data frame for fixed component
pred_df.fixed <- data.frame(humidity = seq(from = 40, to = 75, length = 10))
# Generate population level predictions (level = 0: fixed effects only)
pred_df.fixed$predicted <- predict(random_int_model, pred_df.fixed, level = 0)
# Visualize predicted values
ggplot(orchids) +
geom_point(aes(x = humidity, y = abundance, col = site)) +
geom_line(aes(x=humidity, y=predicted), data=pred_df.fixed, size=2)
# Create data frame for random component (every humidity value at every site)
pred_df.random <- expand.grid(humidity = seq(from = 40, to = 75, length = 10), site = unique(orchids$site))
# Generate within-site predictions (level = 1: fixed effects + site intercepts)
pred_df.random$random <- predict(random_int_model, newdata=pred_df.random, level=1)
# Visualize predicted values: thick line = population level, thin lines = per-site
ggplot(orchids) +
geom_point(aes(x = humidity, y = abundance, col = site)) +
geom_line(aes(x = humidity, y = predicted), size = 2, data = pred_df.fixed) +
geom_line(aes(x=humidity, y=random, col = site), data = pred_df.random)
Chapter 4 - Mixed Effects Models II
Random Intercept and Slope Models:
Model Selection and Interpretation:
Using Modeling as a Tool:
Wrap Up:
Example code includes:
# Apply random intercept and slope model: ~1 + humidity | site gives each
# site its own intercept AND its own humidity slope (correlated random effects)
random_int_slope_model <- nlme::lme(abundance ~ humidity, random = ~1 + humidity | site, data=orchids)
# Look at model output
random_int_slope_model
## Linear mixed-effects model fit by REML
## Data: orchids
## Log-restricted-likelihood: -409
## Fixed: abundance ~ humidity
## (Intercept) humidity
## -4.141 0.176
##
## Random effects:
## Formula: ~1 + humidity | site
## Structure: General positive-definite, Log-Cholesky parametrization
## StdDev Corr
## (Intercept) 7.328 (Intr)
## humidity 0.139 -0.885
## Residual 2.721
##
## Number of Observations: 160
## Number of Groups: 8
# Create data frame for fixed component
pred_df.fixed <- data.frame(humidity = seq(from = 40, to = 75, length = 10))
# Generate population level predictions (level = 0: fixed effects only)
pred_df.fixed$predicted <- predict(random_int_slope_model, pred_df.fixed, level = 0)
# Create data frame for random component
pred_df.random <- expand.grid(humidity = seq(from = 40, to = 75, length = 10), site = unique(orchids$site))
# Generate within-site predictions (level = 1: site-specific intercept and slope)
pred_df.random$random <- predict(random_int_slope_model, pred_df.random, level = 1)
# Visualize population level predictions of abundance (fixed component only)
ggplot(orchids) +
geom_point(aes(x = humidity, y = abundance, col = site)) +
geom_line(aes(x=humidity, y=predicted), size=2, data=pred_df.fixed)
# Visualize population level predictions and within-site predictions of abundance
ggplot(orchids) +
geom_point(aes(x = humidity, y = abundance, col = site)) +
geom_line(aes(x = humidity, y = predicted), size = 2, data = pred_df.fixed) +
geom_line(aes(x=humidity, y=random, col = site), data = pred_df.random)
# Apply a maximum likelihood ratio test
anova(random_int_model, random_int_slope_model)
## Model df AIC BIC logLik Test L.Ratio p-value
## random_int_model 1 4 849 861 -420
## random_int_slope_model 2 6 829 848 -409 1 vs 2 23.1 <.0001
# Calculate the corrected p-value: variance parameters are tested on the
# boundary of their parameter space, so average the chi-squared tail
# probabilities with 1 and 2 degrees of freedom (50:50 mixture)
LR <- ((-420.2667) - (-408.7254)) * -2
((1 - pchisq(LR, 1)) + (1 - pchisq(LR, 2))) * 0.5
## [1] 5.64e-06
# Print the model that has fewer parameters (the random-intercept-only model)
fewer_parameters <- random_int_model
fewer_parameters
## Linear mixed-effects model fit by REML
## Data: orchids
## Log-restricted-likelihood: -420
## Fixed: abundance ~ humidity
## (Intercept) humidity
## -4.375 0.179
##
## Random effects:
## Formula: ~1 | site
## (Intercept) Residual
## StdDev: 3.52 3.07
##
## Number of Observations: 160
## Number of Groups: 8
# Print the model that has the better (lower) AIC value
better_aic_value <- random_int_slope_model
better_aic_value
## Linear mixed-effects model fit by REML
## Data: orchids
## Log-restricted-likelihood: -409
## Fixed: abundance ~ humidity
## (Intercept) humidity
## -4.141 0.176
##
## Random effects:
## Formula: ~1 + humidity | site
## Structure: General positive-definite, Log-Cholesky parametrization
## StdDev Corr
## (Intercept) 7.328 (Intr)
## humidity 0.139 -0.885
## Residual 2.721
##
## Number of Observations: 160
## Number of Groups: 8
# View the model output
random_int_slope_model
## Linear mixed-effects model fit by REML
## Data: orchids
## Log-restricted-likelihood: -409
## Fixed: abundance ~ humidity
## (Intercept) humidity
## -4.141 0.176
##
## Random effects:
## Formula: ~1 + humidity | site
## Structure: General positive-definite, Log-Cholesky parametrization
## StdDev Corr
## (Intercept) 7.328 (Intr)
## humidity 0.139 -0.885
## Residual 2.721
##
## Number of Observations: 160
## Number of Groups: 8
# Calculate the estimated variance of random intercept
# (variance = stddev squared; use `^`, the idiomatic R power operator, not `**`)
variance_int <- 7.3277203^2
# Calculate the estimated variance of random slope
variance_slope <- 0.1387053^2
# Print the higher estimate
variance_int
## [1] 53.7
# Simulated data set: 160 observations of a count-like response y and a
# continuous predictor x, split into 8 groups (a-h) of 20 observations each
myData <- data.frame(X=1:160,
y=c(7, 4, 4, 4, 4, 3, 4, 3, 3, 3, 7, 3, 3, 3, 3, 3, 4, 4, 6, 3, 2, 3, 3, 7, 8, 2, 3, 4, 9, 4, 3, 3, 4, 6, 4, 3, 3, 4, 7, 5, 2, 1, 2, 2, 2, 3, 0, 3, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 1, 2, 3, 1, 3, 1, 2, 2, 0, 2, 0, 2, 3, 1, 2, 1, 2, 1, 1, 2, 2, 10, 3, 0, 2, 2, 7, 4, 0, 2, 2, 5, 2, 1, 2, 2, 5, 4, 1, 3, 2, 2, 1, 4, 2, 2, 1, 2, 2, 2, 3, 3, 2, 3, 2, 4, 2, 2, 4, 2, 2, 1, 1, 2, 2, 2, 1, 1, 2, 2, 2, 0, 2, 2, 2, 1, 0, 1, 3, 2, 2, 2, 3, 2, 0, 1, 3, 3, 2, 1, 2, 3, 3, 3, 0, 2, 2, 2, 2, 1),
x=c(59.5, 70.4, 73.4, 53.8, 66.8, 57.9, 78.7, 81, 60.1, 73.3, 69.3, 60.1, 63.4, 48.1, 65.4, 58.2, 68.9, 75.1, 58.3, 69.4, 48.1, 51.8, 53.6, 59.4, 73.3, 43.4, 45.2, 58.5, 52.3, 81.5, 39.9, 48.1, 50.8, 49.8, 82.5, 44.3, 56.1, 57.6, 62.9, 78, 69.8, 45.1, 62, 64.8, 58.3, 68.6, 38.6, 64.5, 57, 65.9, 71.3, 33.8, 72.7, 71.1, 53.7, 73.4, 46.2, 63.4, 62.9, 53, 42.3, 60.3, 55.4, 46.3, 68.1, 45.5, 53.7, 47.1, 43, 58.7, 48, 68.9, 54.7, 58.2, 66.4, 42.5, 59.1, 50.6, 50.2, 70.8, 48.8, 65, 52.7, 43.7, 59.5, 42.1, 74.7, 61.6, 45.6, 61.2, 59.3, 72.5, 42.3, 50.1, 68.9, 44.1, 68.6, 49.6, 44.4, 61.9, 65.8, 63.5, 37.8, 68.9, 52.3, 74.1, 66, 38, 68.2, 55.7, 62.1, 57.8, 29.5, 76.6, 51.3, 70.3, 59.6, 37, 65.2, 54, 51.2, 42.1, 46.2, 60.6, 56.3, 51.1, 42.5, 38.5, 51.6, 50.4, 59.1, 33.4, 49.2, 65.7, 64.7, 57, 38.7, 49.7, 60.2, 54.7, 43.3, 63.7, 70, 58.3, 39.5, 48.5, 68.6, 68.5, 57.6, 47, 40.3, 70.8, 77.4, 59.9, 48.8, 43.5, 63, 65.1, 54.9, 40),
group=as.factor(c('a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'd', 'd', 'd', 'd', 'd', 'd', 'd', 'd', 'd', 'd', 'd', 'd', 'd', 'd', 'd', 'd', 'd', 'd', 'd', 'd', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'g', 'g', 'g', 'g', 'g', 'g', 'g', 'g', 'g', 'g', 'g', 'g', 'g', 'g', 'g', 'g', 'g', 'g', 'g', 'g', 'h', 'h', 'h', 'h', 'h', 'h', 'h', 'h', 'h', 'h', 'h', 'h', 'h', 'h', 'h', 'h', 'h', 'h', 'h', 'h'))
)
# Candidate models for the simulated data:
# Linear model fit by generalized least squares (no grouping structure)
model1 <- nlme::gls(y ~ x, data = myData)
# Ordinary linear regression (glm with the default gaussian family)
model2 <- glm(y ~ x, data = myData)
# Fixed-effects model: separate intercept and slope for each group
model3 <- glm(y ~ x * group, data = myData, family = "gaussian")
# Negative binomial GLM; theta estimation hits its iteration limit and the
# fitted theta is very large (see init.theta in the output below)
model4 <- MASS::glm.nb(y ~ x, data = myData)
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
# Random intercept model (nlme::lme defaults to REML)
model5 <- nlme::lme(y ~ x, random = ~1|group, data = myData)
# Random intercept and slope model
model6 <- nlme::lme(y ~ x, random = ~1 + x|group, data = myData)
# Print the coefficients of the model
coef(model3)
## (Intercept) x groupb groupc groupd groupe
## 4.0308 -0.0020 -2.9804 -4.5017 -5.7360 -9.2038
## groupf groupg grouph x:groupb x:groupc x:groupd
## -3.3564 -5.8536 -5.4391 0.0600 0.0408 0.0627
## x:groupe x:groupf x:groupg x:grouph
## 0.1475 0.0310 0.0679 0.0615
# Print the model output
model4
##
## Call: MASS::glm.nb(formula = y ~ x, data = myData, init.theta = 31010.139,
## link = log)
##
## Coefficients:
## (Intercept) x
## -0.6224 0.0265
##
## Degrees of Freedom: 159 Total (i.e. Null); 158 Residual
## Null Deviance: 168
## Residual Deviance: 131 AIC: 555
# Print the model formula
model5$call
## lme.formula(fixed = y ~ x, data = myData, random = ~1 | group)
# Re-fit the candidate models; note that model4 and model5 now name lme
# fits, overwriting the earlier glm.nb/lme objects
model1 <- nlme::gls(y ~ x, data = myData)
model2 <- glm(y ~ x, data = myData)
model3 <- glm(y ~ x * group, data = myData, family = "gaussian")
model4 <- nlme::lme(y ~ x, random = ~1|group, data = myData, method = "REML")
model5 <- nlme::lme(y ~ x, random = ~1|group, data = myData, method = "ML")
# Apply a likelihood ratio test (random intercept model vs no random effect)
anova(model4, model1)
## Model df AIC BIC logLik Test L.Ratio p-value
## model4 1 4 552 564 -272
## model1 2 3 596 605 -295 1 vs 2 46 <.0001
# Correct the p-value: the random-intercept variance is tested on the
# boundary of its parameter space, so halve the chi-squared tail probability
LR <- ((-295.0109) - (-272.0328)) * -2
(1 - pchisq(LR, 1)) * 0.5
## [1] 6.05e-12
# Generate data frame of residuals and fitted values.
# Renamed from `diag` to `diag_df`: `diag` masks the base::diag() function.
diag_df <- data.frame(residuals=model2$residuals, fitted=model2$fitted)
# Visualize residuals vs fitted values
ggplot(diag_df, aes(x=fitted, y=residuals)) +
geom_point()
# Fixed-effects gaussian model with a per-group intercept and slope
model1 <- glm(y ~ x * group, data = myData, family = "gaussian")
# Negative binomial version (same theta iteration warnings as before)
model2 <- MASS::glm.nb(y ~ x * group, data = myData)
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
# Random intercept and slope model
model3 <- nlme::lme(y ~ x, random = ~1 + x|group, data = myData)
# Create a data frame where x is 35 and 40 at each group
pred_df <- expand.grid(x = c(35, 40), group = unique(myData$group))
# Generate predictions using model1 (gaussian glm: response scale by default)
predict(model1, pred_df)
## 1 2 3 4 5 6 7 8 9 10
## 3.9610 3.9510 3.0816 3.3718 0.8874 1.0814 0.4178 0.7211 -0.0807 0.6468
## 11 12 13 14 15 16
## 1.6911 1.8363 0.4853 0.8150 0.6745 0.9720
# Generate predictions using model2 (type = "response" back-transforms from the log link)
predict(model2, pred_df, type = "response")
## 1 2 3 4 5 6 7 8 9 10 11 12 13
## 3.961 3.951 3.266 3.477 0.997 1.122 0.741 0.891 0.922 1.180 1.720 1.837 0.692
## 14 15 16
## 0.868 0.933 1.091
# Generate predictions using model3 (lme default includes the group-level effects;
# output is labelled by group)
predict(model3, pred_df)
## a a b b c c d d e e f f g
## 2.746 2.922 2.920 3.218 0.803 1.027 0.497 0.796 0.618 1.172 1.505 1.694 0.574
## g h h
## 0.892 0.733 1.027
## attr(,"label")
## [1] "Predicted values"
Chapter 1 - Introduction to the Workflow
Background:
Counting Words:
unnest_tokens(input=text, output=word) %>% count(chapter, word) %>% cast_dtm(document=chapter, term=word, value=n) # created as a sparse matrix
Displaying Frequencies with ggplot:
ggplot(aes(x=document, y=gamma)) + geom_col(aes(fill=as.factor(topic)))
ggplot(aes(x=term, y=beta)) + geom_col(aes(fill=as.factor(topic)), position=position_dodge())
mutate(topic = as.factor(topic)) %>% ggplot(aes(x=term, y=beta)) + geom_col(aes(fill=topic), position=position_dodge()) + theme(axis.text.x = element_text(angle=90)) # labels rotated 90 degrees
Example code includes:
# Per-topic word probabilities for a 2-topic model:
# rows = topics 1 and 2, columns = the 34 vocabulary terms
word_topics <- matrix(data=c(0.0034, 0.0279, 0.0374, 0.0025, 0.0034, 0.0787, 0.0034, 0.0279, 0.0034, 0.0533, 0.1054, 0.0025, 0.0034, 0.0787, 0.0034, 0.1294, 0.0034, 0.1041, 0.0034, 0.0279, 0.0034, 0.0279, 0.0034, 0.0279, 0.0374, 0.0025, 0.0034, 0.0533, 0.0034, 0.0787, 0.0034, 0.1294, 0.0034, 0.0279, 0.0714, 0.0025, 0.0374, 0.0025, 0.0374, 0.0025, 0.0714, 0.0025, 0.1054, 0.0025, 0.1395, 0.0025, 0.0374, 0.0025, 0.0374, 0.0025, 0.0374, 0.0025, 0.0374, 0.0025, 0.0374, 0.0025, 0.0374, 0.0025, 0.0034, 0.0279, 0.0374, 0.0025, 0.0034, 0.0279, 0.0034, 0.0279, 0.0374, 0.0025),
nrow=2, ncol=34, byrow=FALSE,
dimnames=list(c(1, 2),
c('agreed', 'bad', 'bank', 'due', 'fines', 'loans', 'pay', 'the', 'to', 'are', 'face', 'if', 'late', 'off', 'will', 'you', 'your', 'a', 'downtown', 'in', 'new', 'opened', 'restaurant', 'is', 'just', 'on', 'street', 'that', 'there', 'warwick', 'for', 'how', 'need', 'want'
)
)
)
# Display the column names (the vocabulary)
colnames(word_topics)
## [1] "agreed" "bad" "bank" "due" "fines"
## [6] "loans" "pay" "the" "to" "are"
## [11] "face" "if" "late" "off" "will"
## [16] "you" "your" "a" "downtown" "in"
## [21] "new" "opened" "restaurant" "is" "just"
## [26] "on" "street" "that" "there" "warwick"
## [31] "for" "how" "need" "want"
# Display the probability of the term "street" in topic 1
word_topics[1, "street"]
## [1] 0.0374
# Sample corpus for the text-mining examples: ch1 holds the paragraphs of a
# chapter of a history of Byzantium, concatenated by paste0() into one string
ch1 <- paste0('Two thousand five hundred and fifty-eight years ago a little fleet of galleys toiled painfully against the current up the long strait of the Hellespont, rowed across the broad Propontis, and came to anchor in the smooth waters of the first inlet which cuts into the European shore of the Bosphorus. There a long crescent-shaped creek, which after-ages were to know as the Golden Horn, strikes inland for seven miles, forming a quiet backwater from the rapid stream which runs outside. On the headland, enclosed between this inlet and the open sea, a few hundred colonists disembarked, and hastily secured themselves from the wild tribes of the inland, by running some rough sort of a stockade across the ground from beach to beach. Thus was founded the city of Byzantium. The settlers were Greeks of the Dorian race, natives of the thriving seaport-state of Megara, one of the most enterprising of all the cities of Hellas in the time of colonial and commercial expansion which was then at its height. Wherever a Greek prow had cut its way into unknown waters, there Megarian seamen were soon found following in its wake. ',
'One band of these venturesome traders pushed far to the West to plant colonies in Sicily, but the larger share of the attention of Megara was turned towards the sunrising, towards the mist-enshrouded entrance of the Black Sea and the fabulous lands that lay beyond. There, as legends told, was to be found the realm of the Golden Fleece, the Eldorado of the ancient world, where kings of untold wealth reigned over the tribes of Colchis: there dwelt, by the banks of the river Thermodon, the Amazons, the warlike women who had once vexed far-off Greece by their inroads: there, too, was to be found, if one could but struggle far enough up its northern shore, the land of the Hyperboreans, the blessed folk who dwell behind the North Wind and know nothing of storm and winter. To seek these fabled wonders the Greeks sailed ever North and East till they had come to the extreme limits of the sea. The riches of the Golden Fleece they did not find, nor the country of the Hyperboreans, nor the tribes of the Amazons; but they did discover many lands well worth the knowing, and grew rich on the profits which they drew from the metals of Colchis and the forests of Paphlagonia, from the rich corn lands by the banks of the Dnieper and Bug, and the fisheries of the Bosphorus and the Maeotic Lake. Presently the whole coastland of the sea, which the Greeks, on their first coming, called Axeinos--\"the Inhospitable\"--became fringed with trading settlements, and its name was changed to Euxeinos--\"the Hospitable\"--in recognition of its friendly ports. It was in a similar spirit that, two thousand years later, the seamen who led the next great impulse of exploration that rose in Europe, turned the name of the \"Cape of Storms\" into that of the \"Cape of Good Hope.\" The Megarians, almost more than any other Greeks, devoted their attention to the Euxine, and the foundation of Byzantium was but one of their many achievements. ',
'Already, seventeen years before Byzantium came into being, another band of Megarian colonists had established themselves at Chalcedon, on the opposite Asiatic shore of the Bosphorus. The settlers who were destined to found the greater city applied to the oracle of Delphi to give them advice as to the site of their new home, and Apollo, we are told, bade them \"build their town over against the city of the blind.\" They therefore pitched upon the headland by the Golden Horn, reasoning that the Chalcedonians were truly blind to have neglected the more eligible site on the Thracian shore, in order to found a colony on the far less inviting Bithynian side of the strait. Early Coin Of Byzantium. Late Coin Of Byzantium Showing Crescent And Star. From the first its situation marked out Byzantium as destined for a great future. Alike from the military and from the commercial point of view no city could have been better placed. Looking out from the easternmost headland of Thrace, with all Europe behind it and all Asia before, it was equally well suited to be the frontier fortress to defend the border of the one, or the basis of operations for an invasion from the other. ',
'As fortresses went in those early days it was almost impregnable--two sides protected by the water, the third by a strong wall not commanded by any neighbouring heights. In all its early history Byzantium never fell by storm: famine or treachery accounted for the few occasions on which it fell into the hands of an enemy. In its commercial aspect the place was even more favourably situated. It completely commanded the whole Black Sea trade: every vessel that went forth from Greece or Ionia to traffic with Scythia or Colchis, the lands by the Danube mouth or the shores of the Maeotic Lake, had to pass close under its walls, so that the prosperity of a hundred Hellenic towns on the Euxine was always at the mercy of the masters of Byzantium. The Greek loved short stages and frequent stoppages, and as a half-way house alone Byzantium would have been prosperous: but it had also a flourishing local trade of its own with the tribes of the neighbouring Thracian inland, and drew much profit from its fisheries: so much so that the city badge--its coat of arms as we should call it--comprised a tunny-fish as well as the famous ox whose form alluded to the legend of the naming of the Bosphorus. As an independent state Byzantium had a long and eventful history. ',
'For thirty years it was in the hands of the kings of Persia, but with that short exception it maintained its freedom during the first three hundred years that followed its foundation. Many stirring scenes took place beneath its walls: it was close to them that the great Darius threw across the Bosphorus his bridge of boats, which served as a model for the more famous structure on which his son Xerxes crossed the Hellespont. Fifteen years later, when Byzantium in common with all its neighbours made an ineffectual attempt to throw off the Persian yoke, in the rising called the \"Ionic Revolt,\" it was held for a time by the arch-rebel Histiaeus, who--as much to enrich himself as to pay his seamen--invented strait dues. He forced every ship passing up or down the Bosphorus to pay a heavy toll, and won no small unpopularity thereby for the cause of freedom which he professed to champion. Ere long Byzantium fell back again into the hands of Persia, but she was finally freed from the Oriental yoke seventeen years later, when the victorious Greeks, fresh from the triumph of Salamis and Mycale, sailed up to her walls and after a long leaguer starved out the obstinate garrison [B.C. 479]. ',
'The fleet wintered there, and it was at Byzantium that the first foundations of the naval empire of Athens were laid, when all the Greek states of Asia placed their ships at the disposal of the Athenian admirals Cimon and Aristeides. During the fifth century Byzantium twice declared war on Athens, now the mistress of the seas, and on each occasion fell into the hands of the enemy--once by voluntary surrender in 439 B.C., once by treachery from within, in 408 B.C. But the Athenians, except in one or two disgraceful cases, did not deal hardly with their conquered enemies, and the Byzantines escaped anything harder than the payment of a heavy war indemnity. In a few years their commercial gains repaired all the losses of war, and the state was itself again. We know comparatively little about the internal history of these early centuries of the life of Byzantium. Some odd fragments of information survive here and there: we know, for example, that they used iron instead of copper for small money, a peculiarity shared by no other ancient state save Sparta. Their alphabet rejoiced in an abnormally shaped {~GREEK CAPITAL LETTER BETA~}, which puzzled all other Greeks, for it resembled a {~GREEK CAPITAL LETTER PI~} with an extra limb. The chief gods of the city were those that we might have expected--Poseidon the ruler of the sea, whose blessing gave Byzantium its chief wealth; and Demeter, the goddess who presided over the Thracian and Scythian corn lands which formed its second source of prosperity. ',
'The Byzantines were, if ancient chroniclers tell us the truth, a luxurious as well as a busy race: they spent too much time in their numerous inns, where the excellent wines of Maronea and other neighbouring places offered great temptations. They were gluttons too as well as tipplers: on one occasion, we are assured, the whole civic militia struck work in the height of a siege, till their commander consented to allow restaurants to be erected at convenient distances round the ramparts. One comic writer informs us that the Byzantines were eating young tunny-fish--their favourite dish--so constantly, that their whole bodies had become well-nigh gelatinous, and it was thought they might melt if exposed to too great heat! Probably these tales are the scandals of neighbours who envied Byzantine prosperity, for it is at any rate certain that the city showed all through its history great energy and love of independence, and never shrank from war as we should have expected a nation of epicures to do. It was not till the rise of Philip of Macedon and his greater son Alexander that Byzantium fell for the fifth time into the hands of an enemy. The elder king was repulsed from the citys walls after a long siege, culminating in an attempt at an escalade by night, which was frustrated owing to the sudden appearance of a light in heaven, which revealed the advancing enemy and was taken by the Byzantines as a token of special divine aid [B.C. 339]. In commemoration of it they assumed as one of their civic badges the blazing crescent and star, which has descended to our own days and is still used as an emblem by the present owners of the city--the Ottoman Sultans. ',
'But after repulsing Philip the Byzantines had to submit some years later to Alexander. They formed under him part of the enormous Macedonian empire, and passed on his decease through the hands of his successors--Demetrius Poliorcetes, and Lysimachus. After the death of the latter in battle, however, they recovered a precarious freedom, and were again an independent community for a hundred years, till the power of Rome invaded the regions of Thrace and the Hellespont. Byzantium was one of the cities which took the wise course of making an early alliance with the Romans, and obtained good and easy terms in consequence. During the wars of Rome with Macedon and Antiochus the Great it proved such a faithful assistant that the Senate gave it the status of a _civitas libera et foederata_, \"a free and confederate city,\" and it was not taken under direct Roman government, but allowed complete liberty in everything save the control of its foreign relations and the payment of a tribute to Rome. It was not till the Roman Republic had long passed away, that the Emperor Vespasian stripped it of these privileges, and threw it into the province of Thrace, to exist for the future as an ordinary provincial town [A.D. 73]. ',
'Though deprived of a liberty which had for long years been almost nominal, Byzantium could not be deprived of its unrivalled position for commerce. It continued to flourish under the _Pax Romana_, the long-continued peace which all the inner countries of the empire enjoyed during the first two centuries of the imperial _regime_, and is mentioned again and again as one of the most important cities of the middle regions of the Roman world. But an evil time for Byzantium, as for all the other parts of the civilized world, began when the golden age of the Antonines ceased, and the epoch of the military emperors followed. In 192 A.D., Commodus, the unworthy son of the great and good Marcus Aurelius, was murdered, and ere long three military usurpers were wrangling for his blood-stained diadem. Most unhappily for itself Byzantium lay on the line of division between the eastern provinces, where Pescennius Niger had been proclaimed, and the Illyrian provinces, where Severus had assumed the imperial style. The city was seized by the army of Syria, and strengthened in haste. Presently Severus appeared from the west, after he had made himself master of Rome and Italy, and fell upon the forces of his rival Pescennius. Victory followed the arms of the Illyrian legions, the east was subdued, and the Syrian emperor put to death. But when all his other adherents had yielded, the garrison of Byzantium refused to submit. ',
'For more than two years they maintained the impregnable city against the lieutenants of Severus, and it was not till A.D. 196 that they were forced to yield. The emperor appeared in person to punish the long-protracted resistance of the town; not only the garrison, but the civil magistrates of Byzantium were slain before his eyes. The massive walls \"so firmly built with great square stones clamped together with bolts of iron, that the whole seemed but one block,\" were laboriously cast down. The property of the citizens was confiscated, and the town itself deprived of all municipal privileges and handed over to be governed like a dependent village by its neighbours of Perinthus. Caracalla, the son of Severus, gave back to the Byzantines the right to govern themselves, but the town had received a hard blow, and would have required a long spell of peace to recover its prosperity. Peace however it was not destined to see. All through the middle years of the third century it was vexed by the incursions of the Goths, who harried mercilessly the countries on the Black Sea whose commerce sustained its trade. Under Gallienus in A.D. 263 it was again seized by an usurping emperor, and shared the fate of his adherents. The soldiers of Gallienus sacked Byzantium from cellar to garret, and made such a slaughter of its inhabitants that it is said that the old Megarian race who had so long possessed it were absolutely exterminated. ',
'But the irresistible attraction of the site was too great to allow its ruins to remain desolate. Within ten years after its sack by the army of Gallienus, we find Byzantium again a populous town, and its inhabitants are specially praised by the historian Trebellius Pollio for the courage with which they repelled a Gothic raid in the reign of Claudius II. The strong Illyrian emperors, who staved off from the Roman Empire the ruin which appeared about to overwhelm it in the third quarter of the third century, gave Byzantium time and peace to recover its ancient prosperity. It profited especially from the constant neighbourhood of the imperial court, after Diocletian fixed his residence at Nicomedia, only sixty miles away, on the Bithynian side of the Propontis. But the military importance of Byzantium was always interfering with its commercial greatness. After the abdication of Diocletian the empire was for twenty years vexed by constant partitions of territory between the colleagues whom he left behind him. Byzantium after a while found itself the border fortress of Licinius, the emperor who ruled in the Balkan Peninsula, while Maximinus Daza was governing the Asiatic provinces. ',
'While Licinius was absent in Italy, Maximinus treacherously attacked his rivals dominions without declaration of war, and took Byzantium by surprise. But the Illyrian emperor returned in haste, defeated his grasping neighbour not far from the walls of the city, and recovered his great frontier fortress after it had been only a few months out of his hands [A.D. 314]. The town must have suffered severely by changing masters twice in the same year; it does not, however, seem to have been sacked or burnt, as was so often the case with a captured city in those dismal days. But Licinius when he had recovered the place set to work to render it impregnable. Though it was not his capital he made it the chief fortress of his realm, which, since the defeat of Maximinus, embraced the whole eastern half of the Roman world. It was accordingly at Byzantium that Licinius made his last desperate stand, when in A.D. 323 he found himself engaged in an unsuccessful war with his brother-in-law Constantine, the Emperor of the West. For many months the war stood still beneath the walls of the city; but Constantine persevered in the siege, raising great mounds which overlooked the walls, and sweeping away the defenders by a constant stream of missiles, launched from dozens of military engines which he had erected on these artificial heights. At last the city surrendered, and the cause of Licinius was lost. Constantine, the last of his rivals subdued, became the sole emperor of the Roman world, and stood a victor on the ramparts which were ever afterwards to bear his name.')
# ch2: full text of the second chapter, concatenated from several string
# literals with paste0(); used below as raw input for tokenization and
# regex-based named-entity extraction. The literals are kept verbatim.
ch2 <- paste0('When the fall of Byzantium had wrecked the fortunes of Licinius, the Roman world was again united beneath the sceptre of a single master. For thirty-seven years, ever since Diocletian parcelled out the provinces with his colleagues, unity had been unknown, and emperors, whose number had sometimes risen to six and sometimes sunk to two, had administered their realms on different principles and with varying success. Constantine, whose victory over his rivals had been secured by his talents as an administrator and a diplomatist no less than by his military skill, was one of those men whose hard practical ability has stamped upon the history of the world a much deeper impress than has been left by many conquerors and legislators of infinitely greater genius. He was a man of that self-contained, self-reliant, unsympathetic type of mind which we recognize in his great predecessor Augustus, or in Frederic the Great of Prussia. Constantine the Great Though the strain of old Roman blood in his veins must have been but small, Constantine was in many ways a typical Roman; the hard, cold, steady, unwearying energy, which in earlier centuries had won the empire of the world, was once more incarnate in him. ',
'But if Roman in character, he was anything but Roman in his sympathies. Born by the Danube, reared in the courts and camps of Asia and Gaul, he was absolutely free from any of that superstitious reverence for the ancient glories of the city on the Tiber which had inspired so many of his predecessors. Italy was to him but a secondary province amongst his wide realms. When he distributed his dominions among his heirs, it was Gaul that he gave as the noblest share to his eldest and best-loved son: Italy was to him a younger childs portion. There had been emperors before him who had neglected Rome: the barbarian Maximinus I. had dwelt by the Rhine and the Danube; the politic Diocletian had chosen Nicomedia as his favourite residence. But no one had yet dreamed of raising up a rival to the mistress of the world, and of turning Rome into a provincial town. If preceding emperors had dwelt far afield, it was to meet the exigencies of war on the frontiers or the government of distant provinces. ',
'It was reserved for Constantine to erect over against Rome a rival metropolis for the civilized world, an imperial city which was to be neither a mere camp nor a mere court, but the administrative and commercial centre of the Roman world. For more than a hundred years Rome had been a most inconvenient residence for the emperors. The main problem which had been before them was the repelling of incessant barbarian inroads on the Balkan Peninsula; the troubles on the Rhine and the Euphrates, though real enough, had been but minor evils. Rome, placed half way down the long projection of Italy, handicapped by its bad harbours and separated from the rest of the empire by the passes of the Alps, was too far away from the points where the emperor was most wanted--the banks of the Danube and the walls of Sirmium and Singidunum. For the ever-recurring wars with Persia it was even more inconvenient; but these were less pressing dangers; no Persian army had yet penetrated beyond Antioch--only 200 miles from the frontier--while in the Balkan Peninsula the Goths had broken so far into the heart of the empire as to sack Athens and Thessalonica. ',
'Constantine, with all the Roman world at his feet, and all its responsibilities weighing on his mind, was far too able a man to overlook the great need of the day--a more conveniently placed administrative and military centre for his empire. He required a place that should be easily accessible by land and sea--which Rome had never been in spite of its wonderful roads--that should overlook the Danube lands, without being too far away from the East; that should be so strongly situated that it might prove an impregnable arsenal and citadel against barbarian attacks from the north; that should at the same time be far enough away from the turmoil of the actual frontier to afford a safe and splendid residence for the imperial court. The names of several towns are given by historians as having suggested themselves to Constantine. First was his own birth-place--Naissus (Nisch) on the Morava, in the heart of the Balkan Peninsula; but Naissus had little to recommend it: it was too close to the frontier and too far from the sea. Sardica--the modern Sofia in Bulgaria--was liable to the same objections, and had not the sole advantage of Naissus, that of being connected in sentiment with the emperors early days. Nicomedia on its long gulf at the east end of the Propontis was a more eligible situation in every way, and had already served as an imperial residence. ',
'But all that could be urged in favour of Nicomedia applied with double force to Byzantium, and, in addition, Constantine had no wish to choose a city in which his own memory would be eclipsed by that of his predecessor Diocletian, and whose name was associated by the Christians, the class of his subjects whom he had most favoured of late, with the persecutions of Diocletian and Galerius. For Ilium, the last place on which Constantine had cast his mind, nothing could be alleged except its ancient legendary glories, and the fact that the mythologists of Rome had always fabled that their city drew its origin from the exiled Trojans of AEneas. Though close to the sea it had no good harbour, and it was just too far from the mouth of the Hellespont to command effectually the exit of the Euxine. Byzantium, on the other hand, was thoroughly well known to Constantine. For months his camp had been pitched beneath its walls; he must have known accurately every inch of its environs, and none of its military advantages can have missed his eye. Nothing, then, could have been more natural than his selection of the old Megarian city for his new capital. Yet the Roman world was startled at the first news of his choice; Byzantium had been so long known merely as a great port of call for the Euxine trade, and as a first-class provincial fortress, that it was hard to conceive of it as a destined seat of empire. When once Constantine had determined to make Byzantium his capital, in preference to any other place in the Balkan lands, his measures were taken with his usual energy and thoroughness. The limits of the new city were at once marked out by solemn processions in the old Roman style. ',
'In later ages a picturesque legend was told to account for the magnificent scale on which it was planned. The emperor, we read, marched out on foot, followed by all his court, and traced with his spear the line where the new fortifications were to be drawn. As he paced on further and further westward along the shore of the Golden Horn, till he was more than two miles away from his starting-point, the gate of old Byzantium, his attendants grew more and more surprised at the vastness of his scheme. At last they ventured to observe that he had already exceeded the most ample limits that an imperial city could require. But Constantine turned to rebuke them: \"I shall go on,\" he said, \"until He, the invisible guide who marches before me, thinks fit to stop.\" Guided by his mysterious presentiment of greatness, the emperor advanced till he was three miles from the eastern angle of Byzantium, and only turned his steps when he had included in his boundary line all the seven hills which are embraced in the peninsula between the Propontis and the Golden Horn. ', 'The rising ground just outside the walls of the old city, where Constantines tent had been pitched during the siege of A.D. 323, was selected out as the market-place of the new foundation. There he erected the _Milion_, or \"golden milestone,\" from which all the distances of the eastern world were in future to be measured. This \"central point of the world\" was not a mere single stone, but a small building like a temple, its roof supported by seven pillars; within was placed the statue of the emperor, together with that of his venerated mother, the Christian Empress Helena. The south-eastern part of the old town of Byzantium was chosen by Constantine for the site of his imperial palace. The spot was cleared of all private dwellings for a space of 150 acres, to give space not only for a magnificent residence for his whole court, but for spacious gardens and pleasure-grounds. 
A wall, commencing at the Lighthouse, where the Bosphorus joins the Propontis, turned inland and swept along parallel to the shore for about a mile, in order to shut off the imperial precinct from the city. ',
'The Heart of Constantinople North-west of the palace lay the central open space in which the life of Constantinople was to find its centre. This was the \"Augustaeum,\" a splendid oblong forum, about a thousand feet long by three hundred broad. It was paved with marble and surrounded on all sides by stately public buildings. To its east, as we have already said, lay the imperial palace, but between the palace and the open space were three detached edifices connected by a colonnade. Of these, the most easterly was the Great Baths, known, from their builder, as the \"Baths of Zeuxippus.\" They were built on the same magnificent scale which the earlier emperors had used in Old Rome, though they could not, perhaps, vie in size with the enormous Baths of Caracalla. Constantine utilized and enlarged the old public bath of Byzantium, which had been rebuilt after the taking of the city by Severus. ',
'He adorned the frontage and courts of the edifice with statues taken from every prominent town of Greece and Asia, the old Hellenic masterpieces which had escaped the rapacious hands of twelve generations of plundering proconsuls and Caesars. There were to be seen the Athene of Lyndus, the Amphithrite of Rhodes, the Pan which had been consecrated by the Greeks after the defeat of Xerxes, and the Zeus of Dodona. Adjoining the Baths, to the north, lay the second great building, on the east side of the Augustaeum--the Senate House. Constantine had determined to endow his new city with a senate modelled on that of Old Rome, and had indeed persuaded many old senatorial families to migrate eastward by judicious gifts of pensions and houses. We know that the assembly was worthily housed, but no details survive about Constantines building, on account of its having been twice destroyed within the century. But, like the Baths of Zeuxippus, it was adorned with ancient statuary, among which the Nine Muses of Helicon are specially cited by the historian who describes the burning of the place in A.D. 404. Linked to the Senate House by a colonnade, lay on the north the Palace of the Patriarch, as the Bishop of Byzantium was ere long to be called, when raised to the same status as his brethren of Antioch and Alexandria. ',
'A fine building in itself, with a spacious hall of audience and a garden, the patriarchal dwelling was yet completely overshadowed by the imperial palace which rose behind it. And so it was with the patriarch himself: he lived too near his royal master to be able to gain any independent authority. Physically and morally alike he was too much overlooked by his august neighbour, and never found the least opportunity of setting up an independent spiritual authority over against the civil government, or of founding an _imperium in imperio_ like the Bishop of Rome. The Atmeidan Hippodrome And St. Sophia. All along the western side of the Augustaeum, facing the three buildings which we have already described, lay an edifice which played a very prominent part in the public life of Constantinople. This was the great Hippodrome, a splendid circus 640 cubits long and 160 broad, in which were renewed the games that Old Rome had known so well. The whole system the chariot-races between the teams that represented the \"factions\" of the Circus was reproduced at Byzantium with an energy that even surpassed the devotion of the Romans to horse racing. ',
'From the first foundation of the city the rivalry of the \"Blues\" and the \"Greens\" was one of the most striking features of the life of the place. It was carried far beyond the circus, and spread into all branches of life. We often hear of the \"Green\" faction identifying itself with Arianism, or of the \"Blue\" supporting a pretender to the throne. Not merely men of sporting interests, but persons of all ranks and professions, chose their colour and backed their faction. The system was a positive danger to the public peace, and constantly led to riots, culminating in the great sedition of A.D. 523, which we shall presently have to describe at length. In the Hippodrome the \"Greens\" always entered by the north-eastern gate, and sat on the east side; the \"Blues\" approached by the north-western gate and stretched along the western side. The emperors box, called the Kathisma, occupied the whole of the short northern side, and contained many hundreds of seats for the imperial retinue. The great central throne of the Kathisma was the place in which the monarch showed himself most frequently to his subjects, and around it many strange scenes were enacted. It was on this throne that the rebel Hypatius was crowned emperor by the mob, with his own wifes necklace for an impromptu diadem. Here also, two centuries later, the Emperor Justinian II. sat in state after his reconquest of Constantinople, with his rivals, Leontius and Apsimarus, bound beneath his footstool, while the populace chanted, in allusion to the names of the vanquished princes, the verse, \"Thou shalt trample on the Lion and the Asp.\" Down the centre of the Hippodrome ran the \"spina,\" or division wall, which every circus showed; it was ornamented with three most curious monuments, whose strange juxtaposition seemed almost to typify the heterogeneous materials from which the new city was built up. ',
'The first and oldest was an obelisk brought from Egypt, and covered with the usual hieroglyphic inscriptions; the second was the most notable, though one of the least beautiful, of the antiquities of Constantinople: it was the three-headed brazen serpent which Pausanias and the victorious Greeks had dedicated at Delphi in 479 B.C., after they had destroyed the Persian army at Plataea. The golden tripod, which was supported by the heads of the serpents, had long been wanting: the sacrilegious Phocians had stolen it six centuries before; but the dedicatory inscriptions engraved on the coils of the pedestal survived then and survive now to delight the archaeologist. The third monument on the \"spina\" was a square bronze column of more modern work, contrasting strangely with the venerable antiquity of its neighbours. By some freak of chance all three monuments have remained till our own day: the vast walls of the Hippodrome have crumbled away, but its central decorations still stand erect in the midst of an open space which the Turks call the Atmeidan, or place of horses, in dim memory of its ancient use. Along the outer eastern wall of the Hippodrome on the western edge of the Augustaeum, stood a range of small chapels and statues, the most important landmark among them being the _Milion_ or central milestone of the empire, which we have already described. ',
'The statues, few at first, were increased by later emperors, till they extended along the whole length of the forum. Constantines own contribution to the collection was a tall porphyry column surmounted by a bronze image which had once been the tutelary Apollo of the city of Hierapolis, but was turned into a representation of the emperor by the easy method of knocking off its head and substituting the imperial features. It was exactly the reverse of a change which can be seen at Rome, where the popes have removed the head of the Emperor Aurelius, and turned him into St. Peter, on the column in the Corso. Building A Palace (from a Byzantine MS.) North of the Hippodrome stood the great church which Constantine erected for his Christian subjects, and dedicated to the Divine Wisdom (_Hagia Sophia_). It was not the famous domed edifice which now bears that name, but an earlier and humbler building, probably of the Basilica-shape then usual. Burnt down once in the fifth and once in the sixth centuries, it has left no trace of its original character. From the west door of St. Sophia a wooden gallery, supported on arches, crossed the square, and finally ended at the \"Royal Gate\" of the palace. By this the emperor would betake himself to divine service without having to cross the street of the Chalcoprateia (brass market), which lay opposite to St. Sophia. The general effect of the gallery must have been somewhat like that of the curious passage perched aloft on arches which connects the Pitti and Uffizi palaces at Florence. The edifices which we have described formed the heart of Constantinople. Between the Palace, the Hippodrome, and the Cathedral most of the important events in the history of the city took place. But to north and west the city extended for miles, and everywhere there were buildings of note, though no other cluster could vie with that round the Augustaeum. 
The Church of the Holy Apostles, which Constantine destined as the burying-place of his family, was the second among the ecclesiastical edifices of the town. Of the outlying civil buildings, the public granaries along the quays, the Golden Gate, by which the great road from the west entered the walls, and the palace of the praetorian praefect, who acted as governor of the city, must all have been well worthy of notice. ',
'A statue of Constantine on horseback, which stood by the last-named edifice, was one of the chief shows of Constantinople down to the end of the Middle Ages, and some curious legends gathered around it. Fifteenth-Century Drawing Of The Equestrian Statue Of Constantine. It was in A.D. 328 or 329--the exact date is not easily to be fixed--that Constantine had definitely chosen Byzantium for his capital, and drawn out the plan for its development. As early as May 11, 330, the buildings were so far advanced that he was able to hold the festival which celebrated its consecration. Christian bishops blessed the partially completed palace, and held the first service in St. Sophia; for Constantine, though still unbaptized himself, had determined that the new city should be Christian from the first. Of paganism there was no trace in it, save a few of the old temples of the Byzantines, spared when the older streets were levelled to clear the ground for the palace and adjoining buildings. The statues of the gods which adorned the Baths and Senate House stood there as works of art, not as objects of worship. To fill the vast limits of his city, Constantine invited many senators of Old Rome and many rich provincial proprietors of Greece and Asia to take up their abode in it, granting them places in his new senate and sites for the dwellings they would require. The countless officers and functionaries of the imperial court, with their subordinates and slaves, must have composed a very considerable element in the new population. The artizans and handicraftsmen were enticed in thousands by the offer of special privileges. Merchants and seamen had always abounded at Byzantium, and now flocked in numbers which made the old commercial prosperity of the city seem insignificant. Most effective--though most demoralizing--of the gifts which Constantine bestowed on the new capital to attract immigrants was the old Roman privilege of free distribution of corn to the populace. 
The wheat-tribute of Egypt, which had previously formed part of the public provision of Rome, was transferred to the use of Constantinople, only the African corn from Carthage being for the future assigned for the subsistence of the older city. On the completion of the dedication festival in 330 A.D. an imperial edict gave the city the title of New Rome, and the record was placed on a marble tablet near the equestrian statue of the emperor, opposite the Strategion. But \"New Rome\" was a phrase destined to subsist in poetry and rhetoric alone: the world from the first very rightly gave the city the founders name only, and persisted in calling it Constantinople. ')
# Combine the two chapter texts into a two-row tibble, one row per chapter
chapters <- tibble::tibble(
  chapter = 1:2,
  text = c(ch1, ch2)
)
# Show the resulting structure
str(chapters)
## Classes 'tbl_df', 'tbl' and 'data.frame': 2 obs. of 2 variables:
## $ chapter: int 1 2
## $ text : chr "Two thousand five hundred and fifty-eight years ago a little fleet of galleys toiled painfully against the curr"| __truncated__ "When the fall of Byzantium had wrecked the fortunes of Licinius, the Roman world was again united beneath the s"| __truncated__
# Tokenize each chapter's text into single words, then count how many
# times each word occurs within each chapter
word_freq <- chapters %>%
  tidytext::unnest_tokens(input = text, output = word, token = "words", format = "text") %>%
  count(chapter, word)
# Spot-check the per-chapter counts for one word
word_freq %>%
  filter(word == "after")
## # A tibble: 2 x 3
## chapter word n
## <int> <chr> <int>
## 1 1 after 11
## 2 2 after 4
# A toy corpus of five short documents: two about banking, two about
# restaurants, and one mixing both themes
corpus <- data.frame(
  text = c(
    'Due to bad loans, the bank agreed to pay the fines',
    'If you are late to pay off your loans to the bank, you will face fines',
    'A new restaurant opened in downtown',
    'There is a new restaurant that just opened on Warwick street',
    'How will you pay off the loans you will need for the restaurant you want opened?'
  ),
  id = paste0("id_", 1:5),
  stringsAsFactors = FALSE
)
# Print the corpus
corpus
## text
## 1 Due to bad loans, the bank agreed to pay the fines
## 2 If you are late to pay off your loans to the bank, you will face fines
## 3 A new restaurant opened in downtown
## 4 There is a new restaurant that just opened on Warwick street
## 5 How will you pay off the loans you will need for the restaurant you want opened?
## id
## 1 id_1
## 2 id_2
## 3 id_3
## 4 id_4
## 5 id_5
# Build a document-term matrix: tokenize the text column (dropping it
# afterwards), count words per document, then cast the counts to a DTM.
dtm <- corpus %>%
  tidytext::unnest_tokens(input = text, output = word, drop = TRUE) %>%
  count(id, word) %>%
  tidytext::cast_dtm(document = id, term = word, value = n)
# Fit a two-topic LDA model with Gibbs sampling; seed fixed for reproducibility.
# (Use `<-` rather than `=` for top-level assignment, per R convention.)
mod <- topicmodels::LDA(x = dtm, k = 2, method = "Gibbs", control = list(alpha = 1, delta = 0.1, seed = 10005))
# posterior(mod)$topics returns the per-document topic probability matrix
modeltools::posterior(mod)$topics
## 1 2
## id_1 0.615 0.385
## id_2 0.444 0.556
## id_3 0.250 0.750
## id_4 0.615 0.385
## id_5 0.111 0.889
# Rebuild the document-term matrix from the corpus
dtm <- corpus %>%
  tidytext::unnest_tokens(input = text, output = word) %>%
  count(id, word) %>%
  tidytext::cast_dtm(document = id, term = word, value = n)
# Fit a two-topic LDA model via Gibbs sampling
mod <- topicmodels::LDA(x = dtm, k = 2, method = "Gibbs", control = list(alpha = 1, delta = 0.1, seed = 10005))
# Look up the per-topic probabilities (beta) of the word "will"
broom::tidy(mod, matrix = "beta") %>%
  filter(term == "will")
## # A tibble: 2 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 will 0.00379
## 2 2 will 0.0767
# Stacked column chart of per-document topic probabilities (gamma)
broom::tidy(mod, matrix = "gamma") %>%
  mutate(topic = as.factor(topic)) %>%
  ggplot(aes(x = document, y = gamma, fill = topic)) +
  geom_col()
Chapter 2 - Word Clouds, Stop Words, Control Arguments
Random Nature of LDA Algorithm:
Manipulating Vocabulary:
Word Clouds:
History of the Byzantine Empire:
Example code includes:
# Restrict the document-term matrix to a fixed seven-word vocabulary
dtm <- dtm[, c("bank", "fines", "loans", "pay", "new", "opened", "restaurant")]
# Print the reduced DTM summary
dtm
## <<DocumentTermMatrix (documents: 5, terms: 7)>>
## Non-/sparse entries: 18/17
## Sparsity : 49%
## Maximal term length: 10
## Weighting : term frequency (tf)
# Display the terms (column names) retained in the DTM
colnames(dtm)
## [1] "bank" "fines" "loans" "pay" "new"
## [6] "opened" "restaurant"
# Fit a two-topic LDA model via Gibbs sampling on the reduced vocabulary
mod <- topicmodels::LDA(x = dtm, k = 2, method = "Gibbs", control = list(alpha = 1, seed = 10005, thin = 1))
# Tidy the beta matrix and pull out the probability of "opened" in topic 2
broom::tidy(mod, matrix = "beta") %>%
  filter(topic == 2 & term == "opened")
## # A tibble: 1 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 2 opened 0.320
# Rebuild the document-term matrix from the corpus
dtm <- corpus %>%
  tidytext::unnest_tokens(input = text, output = word, drop = TRUE) %>%
  count(id, word) %>%
  tidytext::cast_dtm(document = id, term = word, value = n)
# Fit LDA topic model using Gibbs sampling for 2 topics (alpha = 1)
mod1 <- topicmodels::LDA(x = dtm, k = 2, method = "Gibbs", control = list(alpha = 1, seed = 10005, thin = 1))
# Display the probabilities of topics in documents side by side.
# pivot_wider() replaces the superseded tidyr::spread(); output is the same.
broom::tidy(mod1, "gamma") %>%
  tidyr::pivot_wider(names_from = topic, values_from = gamma)
## # A tibble: 5 x 3
## document `1` `2`
## <chr> <dbl> <dbl>
## 1 id_1 0.308 0.692
## 2 id_2 0.278 0.722
## 3 id_3 0.875 0.125
## 4 id_4 0.923 0.0769
## 5 id_5 0.389 0.611
# Refit with a much larger Dirichlet prior (alpha = 25) for comparison
mod2 <- topicmodels::LDA(x = dtm, k = 2, method = "Gibbs", control = list(alpha = 25, seed = 10005, thin = 1))
# Display the probabilities of topics in documents side by side.
# pivot_wider() replaces the superseded tidyr::spread(); output is the same.
broom::tidy(mod2, "gamma") %>%
  tidyr::pivot_wider(names_from = topic, values_from = gamma)
## # A tibble: 5 x 3
## document `1` `2`
## <chr> <dbl> <dbl>
## 1 id_1 0.443 0.557
## 2 id_2 0.5 0.5
## 3 id_3 0.518 0.482
## 4 id_4 0.557 0.443
## 5 id_5 0.485 0.515
# Construct the document-term matrix for the corpus
dtm <- corpus %>%
  tidytext::unnest_tokens(output = word, input = text) %>%
  count(id, word) %>%
  tidytext::cast_dtm(document = id, term = word, value = n)
# Show the DTM as a dense matrix of word counts
as.matrix(dtm)
## Terms
## Docs agreed bad bank due fines loans pay the to are face if late off will you
## id_1 1 1 1 1 1 1 1 2 2 0 0 0 0 0 0 0
## id_2 0 0 1 0 1 1 1 1 2 1 1 1 1 1 1 2
## id_3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## id_4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## id_5 0 0 0 0 0 1 1 2 0 0 0 0 0 1 2 3
## Terms
## Docs your a downtown in new opened restaurant is just on street that there
## id_1 0 0 0 0 0 0 0 0 0 0 0 0 0
## id_2 1 0 0 0 0 0 0 0 0 0 0 0 0
## id_3 0 1 1 1 1 1 1 0 0 0 0 0 0
## id_4 0 1 0 0 1 1 1 1 1 1 1 1 1
## id_5 0 0 0 0 0 1 1 0 0 0 0 0 0
## Terms
## Docs warwick for how need want
## id_1 0 0 0 0 0
## id_2 0 0 0 0 0
## id_3 0 0 0 0 0
## id_4 1 0 0 0 0
## id_5 0 1 1 1 1
# Build the document-term matrix with stop words removed.
# The join key is now explicit (by = "word"): this documents intent and
# suppresses the "Joining, by = ..." message that an implicit join prints.
dtm <- corpus %>%
  tidytext::unnest_tokens(output = word, input = text) %>%
  anti_join(tidytext::stop_words, by = "word") %>%
  count(id, word) %>%
  tidytext::cast_dtm(document = id, term = word, value = n)
# Display the matrix
as.matrix(dtm)
## Terms
## Docs agreed bad bank due fines loans pay late downtown restaurant street
## id_1 1 1 1 1 1 1 1 0 0 0 0
## id_2 0 0 1 0 1 1 1 1 0 0 0
## id_3 0 0 0 0 0 0 0 0 1 1 0
## id_4 0 0 0 0 0 0 0 0 0 1 1
## id_5 0 0 0 0 0 1 1 0 0 1 0
## Terms
## Docs warwick
## id_1 0
## id_2 0
## id_3 0
## id_4 1
## id_5 0
# Dictionary holding the seven terms we want to keep in the vocabulary
dictionary <- tibble::tibble(
  word = c("bank", "fines", "loans", "pay", "new", "opened", "restaurant")
)
# Print the dictionary
dictionary
## # A tibble: 7 x 1
## word
## <chr>
## 1 bank
## 2 fines
## 3 loans
## 4 pay
## 5 new
## 6 opened
## 7 restaurant
# Keep only words listed in the dictionary via an inner join.
# The key is explicit (by = "word"): this documents intent and suppresses
# the "Joining, by = ..." message that an implicit join prints.
dtm <- corpus %>%
  tidytext::unnest_tokens(output = word, input = text) %>%
  inner_join(dictionary, by = "word") %>%
  count(id, word) %>%
  tidytext::cast_dtm(document = id, term = word, value = n)
# Display the resulting matrix
as.matrix(dtm)
## Terms
## Docs bank fines loans pay new opened restaurant
## id_1 1 1 1 1 0 0 0
## id_2 1 1 1 1 0 0 0
## id_3 0 0 0 0 1 1 1
## id_4 0 0 0 0 1 1 1
## id_5 0 0 1 1 0 1 1
# Count total occurrences of each word across the whole corpus
word_frequencies <- corpus %>%
  tidytext::unnest_tokens(input = text, output = word) %>%
  count(word)
# Draw a word cloud of (at most) the ten most frequent words
wordcloud::wordcloud(
  words = word_frequencies$word,
  freq = word_frequencies$n,
  min.freq = 1,
  max.words = 10,
  colors = c("DarkOrange", "Blue"),
  random.order = FALSE,
  random.color = FALSE
)
# DO NOT HAVE FILE 'history'
# Construct a document-term matrix
# dtm <- history %>%
# tidytext::unnest_tokens() %>%
# anti_join(stop_words) %>%
# count(chapter, word) %>%
# tidytext::cast_dtm(document=chapter, term=word, value=n)
#
# # Insert the missing arguments
# mod <- topicmodels::LDA(x=dtm, k=4, method="Gibbs", control=list(alpha=1, seed=10005))
#
# # Display top 15 words of each topic
# terms(mod, k=15)
#
#
# # Display the structure of the verbs dataframe
# str(verbs)
#
# # Construct a document-term matrix
# dtm <- history %>%
# tidytext::unnest_tokens() %>%
# inner_join(verbs, by=c("word"="past")) %>%
# count(chapter, word) %>%
# tidytext::cast_dtm(document=chapter, term=word, value=n)
#
# # Fit LDA for four topics
# mod <- topicmodels::LDA(x=dtm, k=4, method="Gibbs", control=list(alpha=1, seed=10005))
#
# # Display top 25 words from each topic
# terms(mod, k=25)
#
#
# # Extract matrix gamma and plot it
# broom::tidy(mod, "gamma") %>%
# mutate(document=as.numeric(document)) %>%
# ggplot(aes(x=document, y=gamma)) +
# geom_line(aes(color=factor(topic))) +
# labs(x="Chapter", y="Topic probability") +
# scale_color_manual(values=brewer.pal(n=4, "Set1"), name="Topic")
#
#
#
# # Display wordclouds one at a time
# for (j in 1:4) {
# # Generate a table with word frequences for topic j
# word_frequencies <- broom::tidy(mod, matrix="beta") %>%
# mutate(n = trunc(beta * 10000)) %>%
# filter(topic == j)
# # Display word cloud
# wordcloud::wordcloud(word = word_frequencies$term, freq = word_frequencies$n, max.words = 20,
# scale = c(3, 0.5), colors = c("DarkOrange", "CornflowerBlue", "DarkRed"),
# rot.per = 0.3
# )
# }
Chapter 3 - Named Entity Recognition as Unsupervised Classification
Using Topic Models as Classifiers:
From Word Windows to DTM:
Corpus Alignment and Classification:
Example code includes:
# Rebuild the document-term matrix from the corpus
dtm <- corpus %>%
  tidytext::unnest_tokens(input = text, output = word, drop = TRUE) %>%
  count(id, word) %>%
  tidytext::cast_dtm(document = id, term = word, value = n)
# Fit a topic model using LDA with Gibbs sampling.
# (`<-` replaces the non-idiomatic `=` top-level assignment.)
mod <- topicmodels::LDA(x = dtm, k = 2, method = "Gibbs", control = list(iter = 500, thin = 1, seed = 12345, alpha = NULL))
# Display topic prevalence in documents as a table.
# pivot_wider() replaces the superseded tidyr::spread(); output is the same.
broom::tidy(mod, "gamma") %>%
  tidyr::pivot_wider(names_from = topic, values_from = gamma)
## # A tibble: 5 x 3
## document `1` `2`
## <chr> <dbl> <dbl>
## 1 id_1 0.475 0.525
## 2 id_2 0.485 0.515
## 3 id_3 0.536 0.464
## 4 id_4 0.525 0.475
## 5 id_5 0.485 0.515
# Fit the model for delta = 0.1
mod <- topicmodels::LDA(x = dtm, k = 2, method = "Gibbs", control = list(iter = 500, seed = 12345, alpha = 1, delta = 0.1))
# Words whose topic probabilities we want to compare.
# (`<-` replaces `=` for top-level assignment.)
my_terms <- c("loans", "bank", "opened", "pay", "restaurant", "you")
# Tidy table of per-topic word probabilities, restricted to my_terms.
# Renamed from `t`, which shadowed the base function t().
term_probs <- broom::tidy(mod, "beta") %>%
  filter(term %in% my_terms)
# Stacked column chart of word probabilities by topic
ggplot(term_probs, aes(x = term, y = beta)) +
  geom_col(aes(fill = factor(topic))) +
  theme(axis.text.x = element_text(angle = 90))
# Fit the model for delta = 0.5
mod <- topicmodels::LDA(x = dtm, k = 2, method = "Gibbs", control = list(iter = 500, seed = 12345, alpha = 1, delta = 0.5))
# Words whose topic probabilities we want to compare.
# (`<-` replaces `=` for top-level assignment.)
my_terms <- c("loans", "bank", "opened", "pay", "restaurant", "you")
# Tidy table of per-topic word probabilities, restricted to my_terms.
# Renamed from `t`, which shadowed the base function t().
term_probs <- broom::tidy(mod, "beta") %>%
  filter(term %in% my_terms)
# Stacked column chart of word probabilities by topic
ggplot(term_probs, aes(x = term, y = beta)) +
  geom_col(aes(fill = factor(topic))) +
  theme(axis.text.x = element_text(angle = 90))
# Combine both chapter texts into one character vector
text <- c(ch1, ch2)
# Regex: two lowercase context words, then a capitalized entity
# (optionally prefixed by "St. "), then two more lowercase context words
p1 <- "( [a-z]+){2}( (St[.] )?[A-Z][a-z]+)+( [a-z]+){2}"
# Obtain the regex match object from gregexpr (one element per text entry)
m <- gregexpr(p1, text)
# Get the matched substrings and flatten the per-element list into a vector
v <- unlist(regmatches(text, m))
# Count how many entity-with-context windows were found
length(v)
## [1] 116
# Extended pattern: like p1, but the entity may also continue with an
# "of"/"the" connector followed by another capitalized word
p2 <- "( [a-z]+){2}( (St[.] )?[A-Z][a-z]+( (of|the) [A-Z][a-z]+)?)+( [a-z]+){2}"
# Obtain the regex match object from gregexpr
m <- gregexpr(p2, text)
# Get the matched substrings and flatten the per-element list into a vector
v <- unlist(regmatches(text, m))
# Count the matches (121 vs 116 for p1 -- the extension catches more entities)
length(v)
## [1] 121
entity_pattern <- "( (St[.] )?[A-Z][a-z]+( (of|the) [A-Z][a-z]+)?)+"
v <- c(' into the European shore of', ' settlers were Greeks of the', ' cities of Hellas in the', ' to the West to plant', ' attention of Megara was turned', ' of the Black Sea and the', ' behind the North Wind and know', ' wonders the Greeks sailed ever', ' of the Golden Fleece they did', ' metals of Colchis and the', ' of the Bosphorus and the', ' foundation of Byzantium was but', ' years before Byzantium came into', ' band of Megarian colonists had', ' the opposite Asiatic shore of', ' oracle of Delphi to give', ' that the Chalcedonians were truly', ' less inviting Bithynian side of', ' marked out Byzantium as destined', ' with all Europe behind it', ' early history Byzantium never fell', ' by the Danube mouth or', ' a hundred Hellenic towns on', ' house alone Byzantium would have', ' independent state Byzantium had a', ' the great Darius threw across', ' his son Xerxes crossed the', ' down the Bosphorus to pay', ' from the Oriental yoke seventeen', ' was at Byzantium that the', ' empire of Athens were laid', ' all the Greek states of', ' fifth century Byzantium twice declared', ' and the Byzantines escaped anything', ' blessing gave Byzantium its chief', ' wines of Maronea and other', ' that the Byzantines were eating', ' rise of Philip of Macedon and his', ' by the Byzantines as a', ' after repulsing Philip the Byzantines had to', ' power of Rome invaded the', ' regions of Thrace and the', ' that the Senate gave it', ' till the Roman Republic had long', ' that the Emperor Vespasian stripped it', ' for itself Byzantium lay on', ' and the Syrian emperor put', ' garrison of Byzantium refused to', ' magistrates of Byzantium were slain', ' to the Byzantines the right', ' on the Black Sea whose commerce', ' the old Megarian race who', ' we find Byzantium again a', ' the historian Trebellius Pollio for the', ' repelled a Gothic raid in', ' from the Roman Empire the ruin', ' on the Bithynian side of', ' importance of Byzantium was always', ' abdication of Diocletian 
the empire', ' and took Byzantium by surprise')
# Print out contents of the `entity_pattern`
entity_pattern
## [1] "( (St[.] )?[A-Z][a-z]+( (of|the) [A-Z][a-z]+)?)+"
# Remove the named entity from text, leaving only the lower-case
# context words surrounding each entity
v2 <- gsub(entity_pattern, "", v)
# Display the head of v2
head(v2)
## [1] " into the shore of" " settlers were of the"
## [3] " cities of in the" " to the to plant"
## [5] " attention of was turned" " of the and the"
# Remove the named entity, leaving only the surrounding context words
v2 <- gsub(entity_pattern, "", v)
# Pattern for inserting suffixes: tag the two left-context and two
# right-context words with positional markers
p <- "\\1_L1 \\2_L2 \\3_R1 \\4_R2"
# Add suffixes to the four lower-case context words of each snippet
context <- gsub("([a-z]+) ([a-z]+) ([a-z]+) ([a-z]+)", p, v2)
# Extract named entity and use it as document ID
doc_id <- unlist(regmatches(v, gregexpr(entity_pattern, v)))
# Make a data frame with columns doc_id and text.
# Use FALSE rather than F: F is an ordinary binding that can be
# reassigned, so it is unsafe as a logical literal.
corpus <- data.frame(doc_id = doc_id, text = context, stringsAsFactors = FALSE)
# Summarize the text to produce a document for each doc_id
# (all context snippets for an entity are pasted into one document)
corpus2 <- corpus %>%
group_by(doc_id) %>%
summarise(doc = paste(text, collapse=" "))
# Make a document-term matrix (rows = entities, columns = context words)
dtm <- corpus2 %>%
tidytext::unnest_tokens(input=doc, output=word) %>%
count(doc_id, word) %>%
tidytext::cast_dtm(document=doc_id, term=word, value=n)
# Fit an LDA model for 3 topics via Gibbs sampling (fixed seed for
# reproducibility)
mod <- topicmodels::LDA(x=dtm, k=3, method="Gibbs", control=list(alpha=1, seed=12345, iter=1000, thin=1))
# Create a table with probabilities of topics in documents
# (wide format: one row per document, one column per topic).
# pivot_wider() supersedes spread() as of tidyr 1.0.0 (which the
# session already loads, per the tidyverse attach banner).
topics <- broom::tidy(mod, matrix="gamma") %>%
pivot_wider(names_from = topic, values_from = gamma)
# Set random seed for reproducibility
set.seed(12345)
# Take a sample of 20 random integers, without replacement
# (these rows become the held-out test set; the rest are training data)
r <- sample.int(n=nrow(corpus2), size=20, replace=FALSE)
# Generate a document-term matrix from the training rows only
train_dtm <- corpus2[-r, ] %>%
tidytext::unnest_tokens(input=doc, output=word) %>%
count(doc_id, word) %>%
tidytext::cast_dtm(document=doc_id, term=word, value=n)
# Fit an LDA topic model for k=3
train_mod <- topicmodels::LDA(x=train_dtm, k=3, method="Gibbs",
control=list(alpha=1, seed=10001, iter=1000, thin=1)
)
# Get the test row indices (same seed as the training split above,
# so r identifies the same 20 held-out rows)
set.seed(12345)
r <- sample.int(n=nrow(corpus2), size=20, replace=FALSE)
# Extract the vocabulary of the training model
model_vocab <- broom::tidy(train_mod, matrix="beta") %>%
select(term) %>%
distinct()
# Create a table of counts with aligned vocabularies.
# right_join keeps every training-vocabulary term; terms that do not
# occur in the test documents come back with NA doc_id and NA n.
test_table <- corpus2[r, ] %>%
tidytext::unnest_tokens(input=doc, output=word) %>%
count(doc_id, word) %>%
right_join(model_vocab, by=c("word"="term"))
# Prepare a document-term matrix.
# NA doc_ids are parked on the first (post-sort) document and NA counts
# become 0, so the test DTM's columns line up exactly with the training
# vocabulary, as modeltools::posterior() requires.
test_dtm <- test_table %>%
arrange(desc(doc_id)) %>%
mutate(doc_id = ifelse(is.na(doc_id), first(doc_id), doc_id), n = ifelse(is.na(n), 0, n)) %>%
tidytext::cast_dtm(document=doc_id, term=word, value=n)
# Obtain posterior probabilities for test documents
results <- modeltools::posterior(object=train_mod, newdata=test_dtm)
# Display the matrix with topic probabilities
results$topics
## 1 2 3
## Thrace 0.500 0.333 0.1667
## Syrian 0.600 0.200 0.2000
## Senate 0.400 0.200 0.4000
## Roman Republic 0.600 0.200 0.2000
## Roman Empire 0.500 0.250 0.2500
## Philip the Byzantines 0.400 0.400 0.2000
## Oriental 0.500 0.250 0.2500
## Megarian 0.200 0.400 0.4000
## Maronea 0.400 0.400 0.2000
## Hellenic 0.250 0.250 0.5000
## Golden Fleece 0.400 0.200 0.4000
## European 0.167 0.667 0.1667
## Emperor Vespasian 0.400 0.200 0.4000
## Diocletian 0.250 0.500 0.2500
## Danube 0.500 0.250 0.2500
## Colchis 0.500 0.333 0.1667
## Chalcedonians 0.400 0.400 0.2000
## Byzantines 0.750 0.167 0.0833
## Black Sea 0.750 0.125 0.1250
## Bithynian 0.167 0.667 0.1667
Chapter 4 - How Many Topics is Enough?
Finding the Best Number of Topics:
Topic Models Fitted to Novels:
unnest_tokens(input=text, output=word) %>% mutate(word_index = 1:n()) %>% mutate(doc_number = word_index %/% 1000 + 1) %>% count(doc_number, word) %>% cast_dtm(term=word, document=doc_number, value=n)
Locking Topics With Seed Words:
Wrap Up:
Example code includes:
# DO NOT HAVE THE DATA FRAME df
# Build a document-term matrix of abstract words, one document per award
# Split the Abstract column into tokens
dtm <- df %>%
tidytext::unnest_tokens(input=Abstract, output=word) %>%
# Remove stopwords (anti_join drops rows whose word appears in stop_words)
anti_join(stop_words) %>%
# Count the number of occurrences
count(AwardNumber, word) %>%
# Create a document term matrix
tidytext::cast_dtm(document=AwardNumber, term=word, value=n)
# Rebuild the DTM, keeping only words that occur at least 10 times
# across the whole corpus (trims the vocabulary before modeling)
dtm <- df %>%
tidytext::unnest_tokens(input=Abstract, output=word) %>%
anti_join(stop_words) %>%
# Count occurrences
count(AwardNumber, word) %>%
# Group the data by word so sum(n) is the corpus-wide frequency
group_by(word) %>%
# Filter for document wide frequency
filter(sum(n) >= 10) %>%
# Ungroup the data and create a document term matrix
ungroup() %>%
tidytext::cast_dtm(document=AwardNumber, term=word, value=n)
# Create a LDA model
mod <- topicmodels::LDA(x=dtm, method="Gibbs", k=3, control=list(alpha=0.5, seed=1234, iter=500, thin=1))
# Retrieve log-likelihood of the fitted model.
# NOTE(review): logLik is an S4 method topicmodels registers on the
# stats generic; confirm `topicmodels::logLik` resolves — plain
# logLik(mod) (or stats::logLik) dispatches to the same method.
topicmodels::logLik(mod)
# Find perplexity of the model on the training DTM (lower is better)
topicmodels::perplexity(object=mod, newdata=dtm)
# Display names of elements in the list of fitted models
names(models[[1]])
# Retrieve the values of k and perplexity, and plot perplexity vs k.
# vapply() is preferred over sapply() in non-interactive code: it
# guarantees one numeric value per model instead of a return type that
# silently varies with the input.
x <- vapply(models, function(m) as.numeric(m[["k"]]), numeric(1))
y <- vapply(models, function(m) as.numeric(m[["perplexity"]]), numeric(1))
plot(x, y, xlab="number of clusters, k", ylab="perplexity score", type="o")
# Re-run each previously fitted model for 100 more Gibbs iterations and
# record the resulting perplexity score on the full document-term matrix.
# vapply() guarantees one numeric result per model, in order.
new_perplexity_score <- vapply(models, function(m) {
  refit <- topicmodels::LDA(x=dtm, model=m$model, control=list(iter=100, seed=12345, thin=1))
  topicmodels::perplexity(object=refit, newdata=dtm)
}, numeric(1))
# Specify the possible values of k and build the plot of the refreshed
# perplexity scores against k
k <- 2:10
# Fixed typo in the user-facing axis label: "clusers" -> "clusters"
plot(x=k, y=new_perplexity_score, xlab="number of clusters, k", ylab="perplexity score", type="o")
# Slice the `history` text into pseudo-documents of 1000 words each.
# NOTE(review): the name `t` masks base::t() (matrix transpose) for the
# rest of the session — consider renaming.
t <- history %>%
# Unnest the tokens
tidytext::unnest_tokens(input=text, output=word) %>%
# Create a word index column
mutate(word_index = row_number()) %>%
# Create a document number column (integer division: 1000 words per doc)
mutate(document_number = word_index %/% 1000 + 1)
dtm <- t %>%
# Join verbs on "word" and "past" (keeps only past-tense verb tokens)
inner_join(verbs, by=c("word"="past")) %>%
# Count word
count(document_number, word) %>%
# Create a document-term matrix
tidytext::cast_dtm(document=document_number, term=word, value=n)
# Store the names of documents in a vector.
# The leading spaces are intentional: the doc_ids were produced by a
# regex whose match includes a leading space.
required_documents <- c(" Africa", " Emperor Heraclius", " Adrianople", " Daniel", " African")
# Convert table into wide format (one gamma column per topic)
broom::tidy(mod, matrix="gamma") %>%
spread(key=topic, value=gamma) %>%
# Keep only the rows with document names matching the required documents
filter(document %in% required_documents)
# Set up the column names so the seed-word weight matrix aligns with
# the vocabulary (columns) of the document-term matrix
colnames(seedwords) <- colnames(dtm)
# Set the weights: seed topic 1 on "defeated_l2" and topic 2 on
# "across_l2". Use <- for assignment, not = (tidyverse style; = at
# top level is an assignment-operator anti-pattern).
seedwords[1, "defeated_l2"] <- 1
seedwords[2, "across_l2"] <- 1
# Fit the topic model, with the seed words nudging topic assignment
mod <- topicmodels::LDA(dtm, k=3, method="Gibbs", seedwords=seedwords,
control=list(alpha=1, iter=500, seed=1234)
)
# Examine topic assignment in the fitted model
# (wide format: one row per document, one gamma column per topic)
broom::tidy(mod, "gamma") %>% spread(topic, gamma) %>%
filter(document %in% c(" Daniel", " Adrianople", " African"))
Chapter 1 - Introduction and Review of plotly
Interactive and Dynamic Graphics:
Adding Aesthetics to Represent a Variable:
Moving Beyond Simple Interactivity:
Example code includes:
# load the plotly package
library(plotly)
acwiDate <- as.Date(c('2017-01-03', '2017-01-04', '2017-01-05', '2017-01-06', '2017-01-09', '2017-01-10', '2017-01-11', '2017-01-12', '2017-01-13', '2017-01-17', '2017-01-18', '2017-01-19', '2017-01-20', '2017-01-23', '2017-01-24', '2017-01-25', '2017-01-26', '2017-01-27', '2017-01-30', '2017-01-31', '2017-02-01', '2017-02-02', '2017-02-03', '2017-02-06', '2017-02-07', '2017-02-08', '2017-02-09', '2017-02-10', '2017-02-13', '2017-02-14', '2017-02-15', '2017-02-16', '2017-02-17', '2017-02-21', '2017-02-22', '2017-02-23', '2017-02-24', '2017-02-27', '2017-02-28', '2017-03-01', '2017-03-02', '2017-03-03', '2017-03-06', '2017-03-07', '2017-03-08', '2017-03-09', '2017-03-10', '2017-03-13', '2017-03-14', '2017-03-15', '2017-03-16', '2017-03-17', '2017-03-20', '2017-03-21', '2017-03-22', '2017-03-23', '2017-03-24', '2017-03-27', '2017-03-28', '2017-03-29', '2017-03-30', '2017-03-31', '2017-04-03', '2017-04-04', '2017-04-05', '2017-04-06', '2017-04-07', '2017-04-10', '2017-04-11', '2017-04-12', '2017-04-13', '2017-04-17', '2017-04-18', '2017-04-19', '2017-04-20', '2017-04-21', '2017-04-24', '2017-04-25', '2017-04-26', '2017-04-27', '2017-04-28', '2017-05-01', '2017-05-02', '2017-05-03', '2017-05-04', '2017-05-05', '2017-05-08', '2017-05-09', '2017-05-10', '2017-05-11', '2017-05-12', '2017-05-15', '2017-05-16', '2017-05-17', '2017-05-18', '2017-05-19', '2017-05-22', '2017-05-23', '2017-05-24', '2017-05-25', '2017-05-26', '2017-05-30', '2017-05-31', '2017-06-01', '2017-06-02', '2017-06-05', '2017-06-06', '2017-06-07', '2017-06-08', '2017-06-09', '2017-06-12', '2017-06-13', '2017-06-14', '2017-06-15', '2017-06-16', '2017-06-19', '2017-06-20', '2017-06-21', '2017-06-22', '2017-06-23', '2017-06-26', '2017-06-27', '2017-06-28', '2017-06-29', '2017-06-30', '2017-07-03', '2017-07-05', '2017-07-06', '2017-07-07', '2017-07-10', '2017-07-11', '2017-07-12', '2017-07-13', '2017-07-14', '2017-07-17', '2017-07-18', '2017-07-19', '2017-07-20', '2017-07-21', '2017-07-24', '2017-07-25', 
'2017-07-26', '2017-07-27', '2017-07-28', '2017-07-31', '2017-08-01', '2017-08-02', '2017-08-03', '2017-08-04', '2017-08-07', '2017-08-08', '2017-08-09', '2017-08-10', '2017-08-11', '2017-08-14', '2017-08-15', '2017-08-16', '2017-08-17', '2017-08-18', '2017-08-21', '2017-08-22', '2017-08-23', '2017-08-24', '2017-08-25', '2017-08-28', '2017-08-29', '2017-08-30', '2017-08-31', '2017-09-01', '2017-09-05', '2017-09-06', '2017-09-07', '2017-09-08', '2017-09-11', '2017-09-12', '2017-09-13', '2017-09-14', '2017-09-15', '2017-09-18', '2017-09-19', '2017-09-20', '2017-09-21', '2017-09-22', '2017-09-25', '2017-09-26', '2017-09-27', '2017-09-28', '2017-09-29', '2017-10-02', '2017-10-03', '2017-10-04', '2017-10-05', '2017-10-06', '2017-10-09', '2017-10-10', '2017-10-11', '2017-10-12', '2017-10-13', '2017-10-16', '2017-10-17', '2017-10-18', '2017-10-19', '2017-10-20', '2017-10-23', '2017-10-24', '2017-10-25', '2017-10-26', '2017-10-27', '2017-10-30', '2017-10-31', '2017-11-01', '2017-11-02', '2017-11-03', '2017-11-06', '2017-11-07', '2017-11-08', '2017-11-09', '2017-11-10', '2017-11-13', '2017-11-14', '2017-11-15', '2017-11-16', '2017-11-17', '2017-11-20', '2017-11-21', '2017-11-22', '2017-11-24', '2017-11-27', '2017-11-28', '2017-11-29', '2017-11-30', '2017-12-01', '2017-12-04', '2017-12-05', '2017-12-06', '2017-12-07', '2017-12-08', '2017-12-11', '2017-12-12', '2017-12-13', '2017-12-14', '2017-12-15', '2017-12-18', '2017-12-19', '2017-12-20', '2017-12-21', '2017-12-22', '2017-12-26', '2017-12-27', '2017-12-28', '2017-12-29'))
acwiOpen <- c(59.61, 59.87, 60.15, 60.35, 60.22, 60.24, 60.25, 60.34, 60.54, 60.4, 60.47, 60.43, 60.43, 60.45, 60.49, 61.08, 61.41, 61.31, 60.89, 60.85, 61.13, 60.85, 61.26, 61.08, 61.12, 61.01, 61.3, 61.58, 61.95, 62, 62.08, 62.49, 62.19, 62.58, 62.61, 62.93, 62.32, 62.48, 62.47, 62.83, 63, 62.82, 62.75, 62.64, 62.63, 62.44, 62.78, 62.93, 62.74, 62.91, 63.65, 63.62, 63.58, 63.79, 62.83, 62.94, 63.09, 62.69, 63.07, 63.15, 63.42, 63.24, 63.31, 63.09, 63.43, 63.14, 63.07, 63.1, 63.13, 63.11, 62.88, 62.83, 62.75, 62.95, 62.96, 63.07, 63.94, 64.23, 64.34, 64.39, 64.34, 64.44, 64.6, 64.47, 64.52, 64.7, 64.95, 64.91, 64.84, 64.92, 64.95, 65.23, 65.58, 65.06, 64.41, 64.91, 65.36, 65.53, 65.48, 65.78, 65.78, 65.66, 65.91, 65.87, 66.44, 66.54, 66.3, 66.36, 66.4, 66.42, 66.06, 66.38, 66.71, 65.72, 66.16, 66.5, 65.86, 65.47, 65.46, 65.5, 65.93, 65.73, 65.62, 65.86, 65.6, 65.67, 65.54, 65.32, 65.09, 65.36, 65.48, 65.93, 66.24, 66.4, 66.74, 66.62, 66.88, 67.25, 67.02, 67.05, 67.22, 67.34, 67.59, 67.03, 67.35, 67.54, 67.79, 67.46, 67.56, 67.52, 67.53, 67.11, 67.03, 66.34, 66.8, 66.94, 67.14, 66.95, 66.27, 66.32, 66.63, 66.77, 67, 67.01, 67.11, 66.6, 66.97, 67.3, 67.73, 67.48, 67.4, 67.76, 67.72, 68.04, 68.46, 68.45, 68.3, 68.5, 68.74, 68.81, 68.88, 68.82, 68.69, 68.51, 68.37, 68.33, 68.26, 68.59, 68.77, 69, 69.17, 69.25, 69.21, 69.45, 69.57, 69.7, 69.73, 70, 70.03, 69.97, 70.14, 69.82, 70.19, 70.25, 70.02, 70.04, 69.87, 69.93, 70.06, 70.21, 70.63, 70.41, 70.48, 70.57, 70.73, 70.71, 70.39, 70.37, 70.02, 70.14, 69.7, 70.21, 70.33, 70.48, 70.89, 71.15, 71.35, 71.31, 71.32, 71.66, 71.67, 71.61, 71.88, 71.88, 71.01, 70.94, 71.58, 71.66, 71.79, 72.04, 72.07, 71.88, 72.54, 72.03, 71.98, 71.83, 71.97, 72.01, 72.04, 72.3, 72.39)
acwiVolume <- 1000 * c(2576.7, 1087.3, 1717.3, 1233.8, 1471.1, 1393.6, 1508.8, 1481.4, 2432.3, 2090, 2246.2, 2137.9, 1611.1, 1991.5, 1641.1, 2840.2, 2160.1, 1003.4, 4898.2, 2894.4, 3562.9, 1657.2, 1997.7, 1144.1, 1062.4, 1950.6, 1028, 1650, 2759.1, 2269.6, 2227.5, 2579.4, 2296.1, 2893.7, 2520.1, 3967.3, 1936.7, 1179.7, 1733, 2769.6, 1960.8, 1665.1, 930.8, 1061.5, 1612.4, 1679.7, 2343.2, 1273.7, 1780.8, 3030.7, 2072.3, 1595.4, 1362.9, 2535.6, 1795.2, 3562.7, 1503.5, 1775.4, 4208, 2474.4, 1574.5, 2063.4, 2034.7, 1977.5, 2895.8, 1652.6, 952.9, 1817.8, 1235.2, 1185.6, 1809.8, 2123.6, 1750.8, 976.5, 1681.4, 2036.3, 1944.4, 2231.6, 1360.9, 1897.3, 1511.7, 1268.5, 3350.2, 938.8, 703.2, 1007.2, 869.8, 1010.1, 932.1, 2738.8, 967.7, 1258.5, 676.4, 1467.8, 1974.6, 1207, 1498.8, 1682.5, 962.7, 1089.5, 1248.6, 1226.7, 1130.4, 3266.9, 3152.7, 1609.4, 1270.5, 2328.8, 1655.5, 1555.6, 3059.1, 850.7, 1955.1, 1871, 1534.7, 1774.6, 2155.1, 1116.5, 2049.5, 1096.9, 3644.7, 1694.3, 1859.1, 1690.4, 3787.3, 1465.5, 3957.6, 2196.1, 2179.2, 761.9, 1039.2, 1509.3, 1474.7, 774.2, 1197.8, 2261.4, 1640.3, 1669.5, 881, 1580.2, 1884.3, 4450.3, 1594.1, 1183.3, 2577.7, 2403.4, 1061.5, 623.1, 1141.4, 917.6, 859.4, 606.6, 1778.9, 1512.2, 1033, 871.8, 1123.2, 1594, 1434.6, 1029.3, 1013.7, 864.3, 4048.5, 989.2, 748.9, 1259.3, 781.2, 1045.1, 1675.7, 4411.3, 1557.2, 1050.2, 610.4, 961.4, 1144, 1079.9, 358.2, 969.8, 1256.6, 1515.8, 2005, 1264.2, 951.8, 4745.4, 1015.3, 801, 3610.4, 2570.3, 3559.1, 1349.5, 1151, 846.5, 1034.1, 625.7, 1088.8, 1111.9, 1388.6, 650.2, 2709.7, 1315.7, 2027.4, 1260.3, 1314.8, 876.1, 677.4, 920, 1039.3, 960.1, 771.1, 1247.4, 1493.2, 1420.1, 1157.6, 764.1, 1731.1, 536.9, 1439.9, 551.3, 789.8, 1108.1, 1198.4, 998.6, 591.1, 889.4, 872.3, 1325.8, 700.6, 1098.1, 1015.9, 1623.8, 7719.2, 6765.1, 2408.8, 2596.6, 1169.7, 1864.7, 1167.7, 5803.3, 1731, 1855.7, 2458.4, 1700.7, 2077.2, 1458.3, 2557.6, 2240.2, 1107.5, 4644.2, 3342.1, 1402.7, 1104.4)
# Assemble the ACWI date/price/volume vectors into one tibble
acwi <- tibble::tibble(Date=acwiDate, Open=acwiOpen, Volume=acwiVolume)
str(acwi)
# Create a time series plot of Open against Date
acwi %>%
plot_ly(x = ~Date, y = ~Open) %>%
add_lines()
# Create a scatterplot with Date on the x-axis
# and Volume on the y-axis
acwi %>%
plot_ly(x = ~Date, y = ~Volume) %>%
add_markers()
happyLE <- c(52.3, 69.1, 65.7, 67.5, 65.1, 72.8, 72.4, 63.1, 66.1, 62.5, 66.6, 72.1, 51.8, 60.3, 68, 58, 65.5, 66.4, 52.4, 58.6, 50.1, 44.6, 46, 69.6, 69.3, 64.1, 55.3, 50.8, 69.9, 67, 72.8, 71.5, 71.7, 63.5, 67.3, 61.6, 64.3, 67.3, 56.7, 71.7, 72.6, 57.1, 64.3, 71.1, 54.9, 71.8, 63.4, 51.2, 53.3, 63.8, 76.5, 67.2, 72.8, 59.5, 60.6, 66, 61, 71.7, 71.9, 74.1, 47, 65.8, 75.3, 64.4, 64.2, 58.7, 62.4, 65.4, 62.9, 57.9, 65.1, 68.9, 52.8, 61.5, 67, 72.2, 65.9, 57, 54.2, 49.2, 71.8, 53.3, 65.6, 68.1, 63.7, 62.3, 67.1, 65.3, 50, 57.6, 55.5, 61.3, 71.6, 71.6, 66.3, 51.3, 45.9, 71.1, 57.5, 63.1, 68, 65.4, 60.3, 69.1, 72.2, 66.9, 63, 64.1, 58, 65.7, 44.4, 75.8, 68.8, 70.9, 55.2, 74, 50, 74.5, 65.3, 73, 73.2, 71.2, 63, 56.7, 66.4, 52.3, 61.7, 65.9, 65.8, 60.4, 51.8, 63.2, 68.6, 72.1, 69.8, 68.4, 63.2, 66.2, 55, 53.8, 52.7)
happyHappy <- c(2.66, 4.64, 5.25, 6.04, 4.29, 7.26, 7.29, 5.15, 6.23, 4.31, 5.55, 6.93, 4.85, 5.65, 5.09, 3.5, 6.33, 5.1, 4.65, 4.59, 5.07, 3.48, 4.56, 6.32, 5.1, 6.16, 4.88, 4.31, 7.23, 5.34, 6.06, 6.79, 7.59, 5.61, 5.84, 3.93, 6.34, 5.94, 4.18, 7.79, 6.64, 4.78, 4.45, 7.07, 5.48, 5.15, 6.33, 4.87, 3.82, 6.02, 5.36, 6.07, 7.48, 4.05, 5.1, 4.72, 4.46, 7.06, 7.33, 6.2, 5.04, 5.89, 5.91, 4.81, 5.88, 4.48, 6.15, 6.09, 5.63, 4.62, 5.98, 5.15, 4.42, 5.65, 6.27, 7.06, 5.23, 4.08, 3.42, 4.74, 6.68, 4.68, 6.17, 6.41, 5.33, 5.33, 5.61, 5.31, 4.28, 4.15, 4.44, 4.74, 7.46, 7.33, 6.48, 4.62, 5.32, 7.58, 5.83, 4.63, 6.57, 5.71, 5.59, 6.2, 5.71, 6.09, 5.58, 6.29, 4.68, 5.12, 4.09, 6.38, 6.37, 6.17, 4.51, 5.87, 2.82, 6.23, 4.33, 7.29, 7.47, 6.36, 5.83, 3.35, 5.94, 4.36, 6.19, 4.12, 5.61, 5.23, 4, 4.31, 7.04, 7.1, 6.99, 6.34, 6.42, 5.18, 3.25, 3.93, 3.64)
happyGDP <- c(7.46, 9.37, 9.54, 9.84, 9.03, 10.71, 10.72, 9.65, 10.69, 8.16, 9.72, 10.66, 7.63, 8.83, 9.37, 9.68, 9.55, 9.82, 7.43, 8.2, 8.13, NA, 7.49, 10.04, 9.64, 9.49, 8.56, 6.63, 9.67, 10.01, 10.36, 10.39, 10.75, 9.59, 9.22, 9.26, 9, 10.29, 7.44, 10.61, 10.56, 9.72, 9.17, 10.71, 8.33, 10.12, 8.92, 7.53, 7.4, 8.4, 10.92, 10.19, 10.76, 8.77, 9.32, 9.85, 9.62, 11.07, 10.41, 10.47, 8.18, 9.03, 10.57, 9.03, 10.07, 8.01, NA, 11.11, 8.11, 8.71, 10.13, 9.49, 6.63, NA, 10.28, 11.47, 9.5, 7.25, 7, 7.61, 10.5, 8.19, 9.91, 9.74, 8.55, 9.32, 9.69, 8.92, 7.05, 8.63, 9.2, 7.8, 10.79, 10.48, 8.58, 6.83, 8.59, 11.08, 8.52, NA, 10, 9.41, 8.94, 10.21, 10.24, 10.03, 10.1, 10.81, 7.81, 9.56, 7.25, 11.32, 10.31, 10.35, 9.41, 10.49, NA, 10.45, 9.38, 10.77, 10.96, NA, 7.96, 7.9, 9.69, 7.25, 10.32, 9.29, 10.12, 9.7, 7.44, 8.97, 11.12, 10.58, 10.9, 9.92, 8.76, 8.74, NA, 8.21, 7.54)
happyRegion <- c('South Asia', 'Central and Eastern Europe', 'Middle East and North Africa', 'Latin America and Caribbean', 'Commonwealth of Independent States', 'North America and ANZ', 'Western Europe', 'Commonwealth of Independent States', 'Middle East and North Africa', 'South Asia', 'Commonwealth of Independent States', 'Western Europe', 'Sub-Saharan Africa', 'Latin America and Caribbean', 'Central and Eastern Europe', 'Sub-Saharan Africa', 'Latin America and Caribbean', 'Central and Eastern Europe', 'Sub-Saharan Africa', 'Southeast Asia', 'Sub-Saharan Africa', 'Sub-Saharan Africa', 'Sub-Saharan Africa', 'Latin America and Caribbean', 'East Asia', 'Latin America and Caribbean', 'Sub-Saharan Africa', 'Sub-Saharan Africa', 'Latin America and Caribbean', 'Central and Eastern Europe', 'Western Europe', 'Central and Eastern Europe', 'Western Europe', 'Latin America and Caribbean', 'Latin America and Caribbean', 'Middle East and North Africa', 'Latin America and Caribbean', 'Central and Eastern Europe', 'Sub-Saharan Africa', 'Western Europe', 'Western Europe', 'Sub-Saharan Africa', 'Commonwealth of Independent States', 'Western Europe', 'Sub-Saharan Africa', 'Western Europe', 'Latin America and Caribbean', 'Sub-Saharan Africa', 'Latin America and Caribbean', 'Latin America and Caribbean', 'East Asia', 'Central and Eastern Europe', 'Western Europe', 'South Asia', 'Southeast Asia', 'Middle East and North Africa', 'Middle East and North Africa', 'Western Europe', 'Middle East and North Africa', 'Western Europe', 'Sub-Saharan Africa', 'Latin America and Caribbean', 'East Asia', 'Middle East and North Africa', 'Commonwealth of Independent States', 'Sub-Saharan Africa', 'Central and Eastern Europe', 'Middle East and North Africa', 'Commonwealth of Independent States', 'Southeast Asia', 'Central and Eastern Europe', 'Middle East and North Africa', 'Sub-Saharan Africa', 'Middle East and North Africa', 'Central and Eastern Europe', 'Western Europe', 'Central and Eastern 
Europe', 'Sub-Saharan Africa', 'Sub-Saharan Africa', 'Sub-Saharan Africa', 'Western Europe', 'Sub-Saharan Africa', 'Sub-Saharan Africa', 'Latin America and Caribbean', 'Commonwealth of Independent States', 'East Asia', 'Central and Eastern Europe', 'Middle East and North Africa', 'Sub-Saharan Africa', 'Southeast Asia', 'Sub-Saharan Africa', 'South Asia', 'Western Europe', 'North America and ANZ', 'Latin America and Caribbean', 'Sub-Saharan Africa', 'Sub-Saharan Africa', 'Western Europe', 'South Asia', 'Middle East and North Africa', 'Latin America and Caribbean', 'Latin America and Caribbean', 'Southeast Asia', 'Central and Eastern Europe', 'Western Europe', 'Central and Eastern Europe', 'Commonwealth of Independent States', 'Middle East and North Africa', 'Sub-Saharan Africa', 'Central and Eastern Europe', 'Sub-Saharan Africa', 'Southeast Asia', 'Central and Eastern Europe', 'Central and Eastern Europe', 'Sub-Saharan Africa', 'East Asia', 'Sub-Saharan Africa', 'Western Europe', 'South Asia', 'Western Europe', 'Western Europe', 'East Asia', 'Commonwealth of Independent States', 'Sub-Saharan Africa', 'Southeast Asia', 'Sub-Saharan Africa', 'Latin America and Caribbean', 'Middle East and North Africa', 'Middle East and North Africa', 'Commonwealth of Independent States', 'Sub-Saharan Africa', 'Commonwealth of Independent States', 'Middle East and North Africa', 'Western Europe', 'North America and ANZ', 'Latin America and Caribbean', 'Commonwealth of Independent States', 'Southeast Asia', 'Middle East and North Africa', 'Sub-Saharan Africa', 'Sub-Saharan Africa')
happyIncome <- factor(c('low', 'upper-middle', 'upper-middle', 'high', 'upper-middle', 'high', 'high', 'upper-middle', 'high', 'lower-middle', 'upper-middle', 'high', 'low', 'lower-middle', 'upper-middle', 'upper-middle', 'upper-middle', 'upper-middle', 'low', 'lower-middle', 'lower-middle', 'low', 'low', 'high', 'upper-middle', 'upper-middle', 'NA', 'NA', 'upper-middle', 'high', 'high', 'high', 'high', 'upper-middle', 'upper-middle', 'NA', 'lower-middle', 'high', 'low', 'high', 'high', 'upper-middle', 'lower-middle', 'high', 'lower-middle', 'high', 'upper-middle', 'low', 'low', 'lower-middle', 'NA', 'high', 'high', 'lower-middle', 'lower-middle', 'NA', 'upper-middle', 'high', 'high', 'high', 'NA', 'upper-middle', 'high', 'upper-middle', 'upper-middle', 'lower-middle', 'lower-middle', 'high', 'NA', 'NA', 'high', 'upper-middle', 'low', 'upper-middle', 'high', 'high', 'NA', 'low', 'low', 'low', 'high', 'lower-middle', 'upper-middle', 'upper-middle', 'lower-middle', 'lower-middle', 'upper-middle', 'lower-middle', 'low', 'lower-middle', 'upper-middle', 'low', 'high', 'high', 'lower-middle', 'low', 'lower-middle', 'high', 'lower-middle', 'NA', 'high', 'upper-middle', 'lower-middle', 'high', 'high', 'upper-middle', 'NA', 'high', 'low', 'upper-middle', 'low', 'high', 'NA', 'high', 'upper-middle', 'NA', 'low', 'high', 'lower-middle', 'high', 'high', 'NA', 'low', 'low', 'upper-middle', 'low', 'high', 'lower-middle', 'upper-middle', 'upper-middle', 'low', 'lower-middle', 'high', 'high', 'high', 'high', 'lower-middle', 'lower-middle', 'NA', 'lower-middle', 'low'), levels=c("low", "lower-middle", "upper-middle", "high"))
happySS <- c(0.491, 0.638, 0.807, 0.907, 0.698, 0.95, 0.906, 0.787, 0.876, 0.713, 0.9, 0.922, 0.436, 0.779, 0.775, 0.768, 0.905, 0.942, 0.785, 0.765, 0.695, 0.32, 0.661, 0.88, 0.772, 0.909, 0.655, 0.67, 0.922, 0.77, 0.819, 0.901, 0.952, 0.894, 0.849, 0.638, 0.829, 0.936, 0.734, 0.964, 0.931, 0.807, 0.59, 0.892, 0.669, 0.753, 0.826, 0.634, 0.647, 0.843, 0.831, 0.877, 0.967, 0.607, 0.796, 0.714, 0.695, 0.943, 0.916, 0.92, 0.661, 0.913, 0.882, 0.815, 0.914, 0.715, 0.792, 0.853, 0.883, 0.707, 0.895, 0.777, 0.685, 0.823, 0.926, 0.905, 0.8, 0.626, 0.555, 0.741, 0.937, 0.779, 0.91, 0.8, 0.831, 0.924, 0.881, 0.641, 0.678, 0.795, 0.828, 0.816, 0.937, 0.955, 0.838, 0.582, 0.733, 0.95, 0.69, 0.824, 0.912, 0.83, 0.851, 0.882, 0.9, 0.811, 0.896, 0.84, 0.744, 0.884, 0.652, 0.897, 0.913, 0.928, 0.87, 0.807, 0.557, 0.903, 0.823, 0.914, 0.95, 0.891, 0.663, 0.705, 0.877, 0.508, 0.916, 0.717, 0.876, 0.908, 0.74, 0.858, 0.836, 0.937, 0.921, 0.914, 0.942, NA, 0.79, 0.744, 0.754)
happyCountry <- c('Afghanistan', 'Albania', 'Algeria', 'Argentina', 'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahrain', 'Bangladesh', 'Belarus', 'Belgium', 'Benin', 'Bolivia', 'Bosnia and Herzegovina', 'Botswana', 'Brazil', 'Bulgaria', 'Burkina Faso', 'Cambodia', 'Cameroon', 'Central African Republic', 'Chad', 'Chile', 'China', 'Colombia', 'Congo (Brazzaville)', 'Congo (Kinshasa)', 'Costa Rica', 'Croatia', 'Cyprus', 'Czech Republic', 'Denmark', 'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador', 'Estonia', 'Ethiopia', 'Finland', 'France', 'Gabon', 'Georgia', 'Germany', 'Ghana', 'Greece', 'Guatemala', 'Guinea', 'Haiti', 'Honduras', 'Hong Kong S.A.R. of China', 'Hungary', 'Iceland', 'India', 'Indonesia', 'Iran', 'Iraq', 'Ireland', 'Israel', 'Italy', 'Ivory Coast', 'Jamaica', 'Japan', 'Jordan', 'Kazakhstan', 'Kenya', 'Kosovo', 'Kuwait', 'Kyrgyzstan', 'Laos', 'Latvia', 'Lebanon', 'Liberia', 'Libya', 'Lithuania', 'Luxembourg', 'Macedonia', 'Madagascar', 'Malawi', 'Mali', 'Malta', 'Mauritania', 'Mauritius', 'Mexico', 'Moldova', 'Mongolia', 'Montenegro', 'Morocco', 'Mozambique', 'Myanmar', 'Namibia', 'Nepal', 'Netherlands', 'New Zealand', 'Nicaragua', 'Niger', 'Nigeria', 'Norway', 'Pakistan', 'Palestinian Territories', 'Panama', 'Peru', 'Philippines', 'Poland', 'Portugal', 'Romania', 'Russia', 'Saudi Arabia', 'Senegal', 'Serbia', 'Sierra Leone', 'Singapore', 'Slovakia', 'Slovenia', 'South Africa', 'South Korea', 'South Sudan', 'Spain', 'Sri Lanka', 'Sweden', 'Switzerland', 'Taiwan Province of China', 'Tajikistan', 'Tanzania', 'Thailand', 'Togo', 'Trinidad and Tobago', 'Tunisia', 'Turkey', 'Turkmenistan', 'Uganda', 'Ukraine', 'United Arab Emirates', 'United Kingdom', 'United States', 'Uruguay', 'Uzbekistan', 'Vietnam', 'Yemen', 'Zambia', 'Zimbabwe')
# Assemble the world-happiness variables into one tibble
happy <- tibble::tibble(region=happyRegion, life.expectancy=happyLE, social.support=happySS,
happiness=happyHappy, log.gdp=happyGDP, income=happyIncome,
country=happyCountry
)
str(happy)
# Create a coded scatterplot of happiness vs. life.expectancy
# (color encodes region, marker size encodes log GDP)
happy %>%
plot_ly(x=~life.expectancy, y=~happiness) %>%
add_markers(color=~region, size=~log.gdp)
# Fill in the specified plotting symbols (one symbol per income level)
happy %>%
plot_ly(x = ~life.expectancy, y = ~happiness) %>%
add_markers(symbol = ~income, symbols = c("circle-open", "square-open", "star-open", "x-thin-open"))
# Complete the following code to polish the plot:
# custom hover text assembled with paste() and <br> HTML line breaks
happy %>%
plot_ly(x = ~social.support, y = ~happiness, hoverinfo = "text",
text = ~paste("Country: ", country, "<br> Income: ", income,
"<br> Happiness: ", round(happiness, 2),
"<br> Social support: ", round(social.support, 2)
)
) %>%
add_markers(symbol = ~income, symbols = c("circle-open", "square-open", "star-open", "x-thin-open")) %>%
layout(xaxis = list(title="Social support index"), yaxis = list(title="National happiness score"))
# Load the state economic data and launch data from local CSV files
us_economy <- readr::read_csv("./RInputFiles/state_economic_data.csv")
str(us_economy, give.attr=FALSE)
launches <- readr::read_csv("./RInputFiles/launches.csv")
str(launches, give.attr=FALSE)
# Change the sizemode so that size refers to the diameter of the points
us_economy %>%
filter(year == 2017) %>%
plot_ly(x = ~gdp, y = ~house_price) %>%
add_markers(size = ~population, color = ~region, marker = list(sizemode="diameter"))
# Create a line chart of house_price over time by state
# (group_by gives plotly one line trace per state)
us_economy %>%
filter(year >= 2000) %>%
group_by(state) %>%
plot_ly(x = ~year, y = ~house_price) %>%
add_lines()
Chapter 2 - Animating Graphics
Introduction to Animated Graphics:
Polishing Animations:
Layering in plotly:
Cumulative Animations:
Example code includes:
# Create an animated bubble chart of house_price against gdp
# (frame = ~year animates over years; ids = ~state makes transitions
# track each state smoothly between frames)
us_economy %>%
plot_ly(x = ~gdp, y = ~house_price) %>%
add_markers(size = ~population, color = ~region, frame = ~year, ids = ~state,
marker = list(sizemode = "diameter")
)
# Animate a bubble chart of house_price against gdp over region
ani <- us_economy %>%
filter(year==2017) %>%
plot_ly(x = ~gdp, y = ~house_price) %>%
add_markers(size = ~population, color = ~region,
frame = ~region, ids = ~state, marker = list(sizemode = "diameter")
)
ani
# Adjust the frame and transition durations (milliseconds)
ani %>%
animation_opts(frame = 2000, transition = 300)
# Change the type of transition to "elastic"
ani %>%
animation_opts(frame = 2000, transition = 300, easing = "elastic")
# Remove the prefix from the slider and change the font color to "red"
ani %>%
animation_opts(frame = 2000, transition = 300, easing = "elastic") %>%
animation_slider(currentvalue = list(prefix = NULL, font = list(color = "red")))
# Polish the x- and y-axis titles
ani %>%
animation_opts(frame = 2000, transition = 300, easing = "elastic") %>%
animation_slider(currentvalue = list(prefix = NULL, font = list(color = "red"))) %>%
layout(xaxis = list(title="Real GDP (millions USD)"), yaxis = list(title="Housing price index"))
# Reduce the bubble size (a larger sizeref shrinks all bubbles)
us_economy %>%
plot_ly(x = ~gdp, y = ~house_price) %>%
add_markers(size = ~population, color = ~region, frame = ~year, ids = ~state,
marker = list(sizemode = "diameter", sizeref=3)
) %>%
layout(xaxis = list(title = "Real GDP (millions USD)", type = "log"),
yaxis = list(title = "Housing price index")
)
# Map state names to the hover info text
us_economy %>%
plot_ly(x = ~gdp, y = ~house_price, hoverinfo = "text", text = ~state) %>%
add_markers(size = ~population, color = ~region, frame = ~year, ids = ~state,
marker = list(sizemode = "diameter", sizeref = 3)
) %>%
layout(xaxis = list(title = "Real GDP (millions USD)", type = "log"),
yaxis = list(title = "Housing price index")
) %>%
# Add the year as background text and remove the slider
us_economy %>%
plot_ly(x = ~gdp, y = ~house_price, hoverinfo = "text", text = ~state) %>%
add_text(x = 200000, y = 450, text = ~year, frame = ~year,
textfont = list(color = toRGB("gray80"), size = 150)
) %>%
add_markers(size = ~population, color = ~region, frame = ~year, ids = ~state,
marker = list(sizemode = "diameter", sizeref = 3)
) %>%
layout(xaxis = list(title = "Real GDP (millions USD)", type = "log"),
yaxis = list(title = "Housing price index")
) %>%
animation_slider(hide = TRUE)
# extract the 1997 data
us1997 <- us_economy %>%
filter(year == 1997)
# create an animated scatterplot with baseline from 1997
# (the static gray 1997 layer stays put while the animated layer moves)
us_economy %>%
plot_ly(x = ~gdp, y = ~house_price) %>%
add_markers(data = us1997, marker = list(color = toRGB("gray60"), opacity = 0.5)) %>%
add_markers(frame = ~year, ids = ~state, data = us_economy, showlegend = FALSE, alpha = 0.5) %>%
layout(xaxis = list(type = "log"))
# Find median life HPI for each region in each year
med_hpi <- us_economy %>%
group_by(region, year) %>%
summarize(median_hpi=median(house_price))
# Animate the cumulative time series of median HPI over time.
# accumulate() (purrr) grows the data one year at a time, so each
# animation frame shows all years up to and including that year.
# NOTE(review): set_names(1997:2017) assumes the data cover exactly
# those 21 years — confirm against the source CSV.
med_hpi %>%
split(.$year) %>%
accumulate(~bind_rows(.x, .y)) %>%
set_names(1997:2017) %>%
bind_rows(.id = "frame") %>%
plot_ly(x=~year, y=~median_hpi, color=~region) %>%
add_lines(frame=~frame, showlegend=FALSE)
Chapter 3 - Linking Graphics
Linking Two Charts:
Brushing Groups:
Selection Strategies:
Making Shinier Charts:
Example code includes:
# Collapse to a single 2017 row per state, averaging employment.
# NOTE(review): grouping by every other column assumes those columns
# are constant within state-year — confirm against the data.
us2017 <- us_economy %>%
filter(year == 2017) %>%
group_by(state, year, gdp, home_owners, house_price, population, region, division) %>%
summarize(employment=mean(employment)) %>%
ungroup()
str(us2017, give.attr=FALSE)
# Create a SharedData object from us2017 (this is what links the
# selections across the two plots below)
shared_us <- crosstalk::SharedData$new(us2017)
# Create a scatterplot of house_price vs. home_owners
p1 <- shared_us %>%
plot_ly(x = ~home_owners, y = ~house_price) %>%
add_markers()
# Scatterplot of house_price vs. employment rate
p2 <- shared_us %>%
plot_ly(x = ~employment/population, y = ~house_price) %>%
add_markers()
# Polish the linked scatterplots
linked_plots <- subplot(p1, p2, titleX = TRUE, shareY = TRUE) %>% hide_legend()
linked_plots
# Add a highlight layer (default trigger: click to highlight)
linked_plots %>%
highlight()
# Enable linked brushing (drag-select points in either panel)
linked_plots %>%
highlight(on = "plotly_selected")
# Enable hover highlighting
linked_plots %>%
highlight(on = "plotly_hover")
# Create a shared data object keyed by individual states
# (selecting any point selects that state's entire series)
state_data <- us_economy %>%
crosstalk::SharedData$new(key=~state)
# Using the shared data, plot house price vs. year
state_data %>%
plot_ly(x=~year, y=~house_price) %>%
# Group by state
group_by(state) %>%
# Add lines
add_lines()
# Create a shared data object keyed by region
shared_region <- us_economy %>%
crosstalk::SharedData$new(key = ~region)
# Create a dotplot of avg house_price by region in 2017
dp_chart <- shared_region %>%
plot_ly() %>%
filter(year == 2017) %>%
group_by(region) %>%
summarize(avg.hpi = mean(house_price, na.rm = TRUE)) %>%
add_markers(x = ~avg.hpi, y = ~region)
# Code for time series plot
ts_chart <- shared_region %>%
plot_ly(x = ~year, y = ~house_price) %>%
group_by(state) %>%
add_lines()
# Link dp_chart and ts_chart (clicking a region dot highlights that
# region's lines in the time series)
subplot(dp_chart, ts_chart)
# Create a shared data object keyed by division
shared_region <- crosstalk::SharedData$new(us2017, key = ~division)
# Create a bar chart for division
bc <- shared_region %>%
plot_ly() %>%
count(division) %>%
add_bars(x = ~division, y = ~n) %>%
layout(barmode = "overlay")
# Bubble chart
bubble <- shared_region %>%
plot_ly(x = ~home_owners, y = ~house_price, hoverinfo = "text", text = ~state) %>%
add_markers(size = ~population, marker = list(sizemode = "diameter"))
# Remove the legend
subplot(bc, bubble) %>% hide_legend()
# Enable persistent hover selection and a color selector
linked_plots %>%
highlight(persistent = TRUE, selectize = TRUE, dynamic = TRUE, on="plotly_hover")
# Create a shared data object keyed by state
state_data <- crosstalk::SharedData$new(us2017, key = ~state, group = "Select a state")
# Enable indirect selection by state
state_data %>%
plot_ly(x = ~home_owners, y = ~house_price, hoverinfo = "text", text = ~state) %>%
add_markers(size = ~population, marker = list(sizemode = "diameter")) %>%
highlight(selectize = TRUE)
# Create a shared data object keyed by region
region_data <- crosstalk::SharedData$new(us2017, key = ~region, group = "Select a region")
# Enable indirect selection by region
region_data %>%
plot_ly(x = ~home_owners, y = ~house_price, hoverinfo = "text", text = ~state) %>%
add_markers(size = ~population, marker = list(sizemode = "diameter")) %>%
highlight(selectize = TRUE)
# Create a row of subplots containing p97, p07, and p17 with widths 6, 3, 3
# crosstalk::bscols(widths=c(6, 3, 3) , p97, p07, p17)
# Specify that p07 should span 5 columns
# crosstalk::bscols(widths=c(NA, 5, NA), p97, p07, p17)
# Stack p07 and p17 in the right column
# crosstalk::bscols(p97, list(p07, p17))
# shared data object
shared_us <- crosstalk::SharedData$new(us2017, key = ~region)
# scatterplot of housing price index against homeownership
p17 <- shared_us %>%
plot_ly(x = ~home_owners, y = ~house_price, color = ~region, height = 400) %>%
add_markers()
# add a column of checkboxes for region to the left of the plot
crosstalk::bscols(widths=c(3, NA),
crosstalk::filter_checkbox(id = "region", label = "Region",
sharedData = shared_us, group = ~region
), p17
)
shared_us <- crosstalk::SharedData$new(us2017)
p17 <- shared_us %>%
plot_ly(x = ~home_owners, y = ~house_price, color = ~region, height = 400) %>%
add_markers() %>%
layout(xaxis = list(title = "Home ownership (%)"), yaxis = list(title = "HPI"))
# add a slider filter for each axis below the scatterplot
crosstalk::bscols(list(p17,
crosstalk::filter_slider(id = "price", label = "HPI",
sharedData = shared_us, column = ~house_price
),
crosstalk::filter_slider(id = "owners", label = "Home ownership (%)",
sharedData = shared_us, column = ~home_owners
)
)
)
Chapter 4 - Case Study: Space Launches
Introduction to the Data:
Recap: Animation:
Recap: Linked Views and Selector Widgets:
Wrap Up:
Example code includes:
# table of launches by year
launches_by_year <- launches %>%
count(launch_year)
# create a line chart of launches over time
launches_by_year %>%
plot_ly(x=~launch_year, y=~n) %>%
add_lines() %>%
layout(xaxis = list(title = "Year"), yaxis = list(title = "Launches"))
# create a filled area chart of launches over time
launches_by_year %>%
plot_ly(x=~launch_year, y=~n) %>%
add_lines(fill="tozeroy") %>%
layout(xaxis = list(title = "Year"), yaxis = list(title = "Launches"))
# create a bar chart of launches over time
launches_by_year %>%
plot_ly(x=~launch_year, y=~n) %>%
add_bars() %>%
layout(xaxis = list(title = "Year"), yaxis = list(title = "Launches"))
# table of state launches by year and state
state_launches <- launches %>%
filter(agency_type == "state") %>%
count(launch_year, state_code)
# create a SharedData object for plotting
shared_launches <- state_launches %>%
crosstalk::SharedData$new(key = ~state_code)
# Create a line chart for launches by state, with highlighting
shared_launches %>%
plot_ly(x=~launch_year, y=~n, color=~state_code) %>%
add_lines() %>%
highlight()
# table of launches by year and agency type
launches_by_year <- launches %>%
count(launch_year, agency_type)
# create a SharedData object for plotting
shared_launches <- launches_by_year %>%
crosstalk::SharedData$new(key = ~agency_type)
# create a line chart displaying launches by agency type, with highlighting
shared_launches %>%
plot_ly(x=~launch_year, y=~n, color=~agency_type) %>%
add_lines() %>%
highlight()
# Complete the state_launches data set
annual_launches <- launches %>%
filter(agency_type == "state") %>%
count(launch_year, state_code) %>%
tidyr::complete(state_code, launch_year, fill = list(n = 0))
# Create the cumulative data set
cumulative_launches <- annual_launches %>%
split(f = .$launch_year) %>%
accumulate(., ~bind_rows(.x, .y)) %>%
bind_rows(.id = "frame")
# Create the cumulative animation
cumulative_launches %>%
plot_ly(x = ~launch_year, y = ~n) %>%
add_lines(color = ~state_code, frame = ~frame, ids = ~state_code)
# Complete the private_launches data set
annual_launches <- launches %>%
filter(agency_type == "private") %>%
rename(year=launch_year, agency_name=agency) %>%
count(year, agency_name) %>%
tidyr::complete(agency_name, year, fill = list(n = 0))
# Create the cumulative data set
cumulative_launches <- annual_launches %>%
split(f = .$year) %>%
accumulate(., ~bind_rows(.x, .y)) %>%
bind_rows(.id = "frame")
# Create the cumulative animation
cumulative_launches %>%
plot_ly(x = ~year, y = ~n, color = ~agency_name) %>%
add_lines(frame = ~frame, ids = ~agency_name)
# Create a SharedData object allowing selection by year
# shared_year <- crosstalk::SharedData$new(launches, key = ~launch_year)
# Create a bar chart of launches by year
# bar <- shared_year %>%
# plot_ly(x = ~launch_year, y = ~n) %>%
# count(launch_year) %>%
# add_bars() %>%
# layout(barmode = "overlay") %>%
# highlight()
# Create a scatterplot of diameter vs. length
# scatter <- shared_year %>%
# plot_ly(x = ~length, y = ~diameter) %>%
# add_markers() %>%
# highlight()
# Use bscols to link the two charts
# crosstalk::bscols(bar, scatter)
# Create a SharedData object allowing selection of observations
# shared_obs <- crosstalk::SharedData$new(lv2000)
# Create a scatterplot of to_thrust against leo_capacity
# p1 <- shared_obs %>%
# plot_ly(x = ~leo_capacity, y = ~to_thrust) %>%
# add_markers()
# Scatterplot of diameter vs. length
# p2 <- shared_obs %>%
# plot_ly(x = ~length, y = ~diameter) %>%
# add_markers()
# Link p1 and p2
# subplot(p1, p2) %>%
# highlight(on = "plotly_selected", off = "plotly_deselect") %>%
# hide_legend()
# SharedData object allowing selection of observations
# shared_obs <- crosstalk::SharedData$new(lv2000)
# Scatterplot of to_thrust against leo_capacity
# scatter <- shared_obs %>%
# plot_ly(x = ~leo_capacity, y = ~to_thrust) %>%
# add_markers()
# Create a histogram of to_thrust
# histo <- shared_obs %>%
# plot_ly(x = ~to_thrust) %>%
# add_histogram(name = "overall")
# Link the two plots
# subplot(scatter, histo) %>%
# hide_legend() %>%
# highlight(on = "plotly_selected")
# Create a SharedData object containing the number of launches by year and state
shared_launches <- launches %>%
filter(agency_type == "state") %>%
count(state_code, launch_year) %>%
crosstalk::SharedData$new()
# Create a line chart displaying the launches by state
launch_ts <- shared_launches %>%
plot_ly(x = ~launch_year, y = ~n, color = ~state_code) %>%
add_lines()
# Add a slider below the chart to filter the years displayed
crosstalk::bscols(list(launch_ts,
crosstalk::filter_slider(id = "time", label = "Year",
sharedData = shared_launches, column = ~launch_year
)
)
)
Chapter 1 - Avoiding Conflict
Defensive R Programming:
Avoid Reinventing the Wheel:
Packages and Namespaces:
Example code includes:
# Create a data frame of the packages where a newer version is available
old <- old.packages()
# Find how many packages need to be updated
# (note: old.packages() returns NULL when everything is current,
# and nrow(NULL) is NULL rather than 0)
no_of_old_pkgs <- nrow(old)
# Count the number of functions in ggplot2
# (getNamespaceExports() lists every exported object, not only functions)
no_of_functions <- length(getNamespaceExports("ggplot2"))
# Load the dplyr and conflicted packages
library("dplyr")
library("conflicted")
# Prefer the dplyr version of the lag function
conflict_prefer("lag", "dplyr")
# This should return NA, 1, 2, 3
lag(1:4)
Chapter 2 - Early Warning Systems
Early Warning Systems:
Message in a Bottle:
Warning:
Stop:
Example code includes:
# Define F to be interpreted as TRUE
# (deliberately demonstrates why T/F are dangerous: they are ordinary
# variables that can be reassigned, unlike the reserved words TRUE/FALSE)
F <- TRUE
# Read in data: don't alter the line below
data_set <- read.csv("iris.csv", header = F)
suppressPackageStartupMessages(library("dplyr"))
# Suppress the standard output of the simulate() function
# Use <- (not =) for top-level assignment, per the tidyverse style guide.
# NOTE(review): simulate() here is presumably the course-provided helper;
# stats::simulate() would require a fitted model object -- confirm
sim <- suppressMessages(simulate(runs = 5))
# Modify the function to make it less noisy
# Run N simulations and collect their results.
#
# N:       number of simulation runs.
# verbose: if TRUE, emit a progress message() after each run
#          (message, not print/cat, so callers can suppressMessages()).
# Returns a numeric vector of length N with one result per run.
get_distribution <- function(N, verbose = TRUE) {
  results <- numeric(N)
  # seq_len() is safe when N == 0, unlike 1:N which yields c(1, 0)
  for (i in seq_len(N)) {
    results[i] <- simulate()
    # Check if verbose is TRUE
    if (isTRUE(verbose)) {
      # Show a progress report
      message("Simulation ", i, " completed")
    }
  }
  results
}
# Create new variable
x <- c(1, 1, 1)
y <- 1:3
# Suppress the warning
m <- suppressWarnings(cor(x, y))
# Mean of a vector of ages, with sanity checks on the input range.
#
# ages: numeric vector of ages in years.
# Returns the arithmetic mean; stops with an error when any age is
# negative or greater than 150.
mean_age <- function(ages) {
  if (any(ages < 0)) {
    stop("You have negative ages!")
  }
  # Stop the execution if any of the ages are over 150
  if (any(ages > 150)) {
    stop("You have ages over 150!")
  }
  # Last expression is the return value; no explicit return() needed
  mean(ages)
}
Chapter 3 - Preparing Defenses
Preparing Defenses:
Comments:
Dotty:
Coding Style:
Static Code Analysis for R:
Example code includes:
# Hand-computed 95% confidence interval for the mean of x
# (intentionally duplicated below for y -- the motivation for ci())
m <- mean(x)
s <- sd(x)
n <- length(x)
c(m - 1.96 * s/sqrt(n), m + 1.96 * s/sqrt(n))
# Same computation repeated verbatim for y
m <- mean(y)
s <- sd(y)
n <- length(y)
c(m - 1.96 * s/sqrt(n), m + 1.96 * s/sqrt(n))
# Define a function to prevent pasting the code above
# 95% confidence interval for the mean of a numeric vector.
# Returns c(lower, upper) using the normal approximation (z = 1.96).
ci <- function(values) {
  centre <- mean(values)
  half_width <- 1.96 * sd(values) / sqrt(length(values))
  c(centre - half_width, centre + half_width)
}
# Define a function to prevent pasting the code above
# 95% confidence interval for the mean of x, with an optional histogram.
ci <- function(x, plot_it = FALSE) {
  # Plot the data first when requested
  if (isTRUE(plot_it)) {
    hist(x)
  }
  centre <- mean(x)
  half_width <- 1.96 * sd(x) / sqrt(length(x))
  c(centre - half_width, centre + half_width)
}
# Generate 100 normal random numbers
sample_values <- rnorm(100)
ci(sample_values)
# Fix the code
# Elementwise sum of the three arguments.
f <- function(x, y, z) {
  total <- x + y + z
  total
}
# Fix the code
# Two-number summary of a numeric vector: c(mean, median).
summarise_data <- function(values) {
  centre <- mean(values)
  middle <- median(values)
  c(centre, middle)
}
stats <- summarise_data(runif(10))
Chapter 4 - Creating a Battle Plan
Battle Plan:
Human Readable Filenames:
Organizing Projects:
Graphics and Output:
Example code includes:
# The load.R file
library("readr")
library("readxl")
# Print the current working directory
getwd()
# Change to relative paths
# (relative paths keep the project portable across machines)
battles <- read_csv("input/battles.csv")
foes <- read_xlsx("input/foes.xlsx")
library("ggplot2")
library("readr")
# Show the file/directory structure
list.files(".")
# Change to relative paths and load the data
battles <- read_csv("input/battles.csv")
g <- ggplot(battles) +
geom_bar(aes(Location)) # Bar chart
# Change to relative paths and save the data
ggsave(filename = "graphics/locations.pdf", plot = g)
# The load.R file
library("readr")
library("readxl")
# Change to relative paths
battles <- read_csv("input/battles.csv")
foes <- read_xlsx("input/foes.xlsx")
library("ggplot2")
library("readr")
source("R/load.R")
# Create a bar chart
g <- ggplot(battles) +
geom_bar(aes(Location))
# Change to relative paths
ggsave(filename = "graphics/locations.pdf", plot = g)
Chapter 1 - Creating Features from Categorical Data
Introduction to Feature Engineering in R:
Binning Encoding: Content Driven:
Binning Encoding: Data Driven:
Example code includes:
id <- c(3410, 9157, 2250, 2353, 4872, 2929, 4077, 1351, 9596, 4157, 3536, 5742, 5183, 2432, 8359, 3055, 6889, 2850, 6975, 7697, 694, 8580, 7161, 3183, 6188, 8429, 7552, 5562, 5455, 5354, 4126, 3246, 2033, 4444, 3875, 2926, 648, 9110, 546, 1871, 4340, 8645, 5766, 189, 6321, 3414, 5686, 2863, 5853, 9226, 655, 2966, 1176, 3817, 1070, 7258, 7871, 7581, 8098, 1123, 7106, 9532, 8566, 4229, 4285, 544, 407, 8262, 6232, 7946, 6995, 8134, 1512, 2302, 2034, 9555, 9467, 7627, 4558, 7661, 7650, 4326, 8771, 8147, 6613, 5080, 9968, 9908, 4585, 3751, 9588, 8491, 84, 6731, 3652, 5449, 4482, 587, 2620, 5655, 9915, 4243, 8141, 8934, 1908, 4480, 2320, 1612, 9315, 8035, 8970, 6039, 1202, 9223, 9879, 2456, 1633, 2798, 4435, 4774, 7706, 6378, 8537, 1235, 1663, 8698, 2397, 951, 8753, 9802, 4504, 426, 1482, 1157, 8866, 9678, 3122, 5903, 1252, 9669, 6233, 5463, 2952, 8259, 1162, 1006, 5625, 8526, 9584, 9177, 5592, 1489, 6035, 223, 62, 751, 393, 2471, 4813, 7462, 1809, 5089, 2290, 2020, 7408, 7338, 1214, 3690, 3715, 250, 2968, 6546, 2300, 8835, 5976, 2783, 3292, 7578, 5519, 3250, 8831, 2278, 9783, 4970, 2446, 5813, 6017, 8048, 4628, 5550, 4995, 636, 6238, 4767, 8013, 4601, 8460, 9929, 9592, 5148, 4215, 8516, 9725, 6753, 3952, 6038, 2682, 5886, 630, 9006, 2710, 812, 1545, 5011, 7587, 7847, 5184, 5889, 7411, 9559, 7008, 5986, 811, 7515, 3084, 929, 6359, 4879, 144, 1712, 6758, 7142, 746, 4947, 3673, 9261, 2745, 9755, 8489, 4073, 8397, 1297, 2407, 4080, 2713, 5275, 8447, 192, 5542, 1927)
grade <- c(5, 12, 3, 3, 6, 4, 5, 2, 12, 6, 5, 7, 7, 3, 11, 4, 9, 4, 9, 10, 1, 11, 9, 4, 8, 11, 10, 7, 7, 7, 6, 4, 3, 6, 5, 4, 1, 11, 1, 3, 6, 11, 8, 1, 8, 5, 7, 4, 8, 12, 1, 4, 2, 5, 2, 9, 10, 10, 10, 2, 9, 12, 11, 6, 6, 1, 1, 10, 8, 10, 9, 10, 2, 3, 3, 12, 12, 10, 6, 10, 10, 6, 11, 10, 8, 7, 12, 12, 6, 5, 12, 11, 1, 9, 5, 7, 6, 1, 4, 7, 12, 6, 10, 11, 3, 6, 3, 2, 12, 10, 11, 8, 2, 12, 12, 3, 2, 4, 6, 6, 10, 8, 11, 2, 2, 11, 3, 2, 11, 12, 6, 1, 2, 2, 11, 12, 4, 8, 2, 12, 8, 7, 4, 10, 2, 2, 7, 11, 12, 12, 7, 2, 8, 1, 1, 1, 1, 3, 6, 9, 3, 7, 3, 3, 9, 9, 2, 5, 5, 1, 4, 8, 3, 11, 8, 4, 4, 10, 7, 4, 11, 3, 12, 7, 3, 8, 8, 10, 6, 7, 7, 1, 8, 6, 10, 6, 11, 12, 12, 7, 6, 11, 12, 9, 5, 8, 4, 8, 1, 11, 4, 1, 2, 7, 10, 10, 7, 8, 9, 12, 9, 8, 1, 10, 4, 2, 8, 6, 1, 3, 9, 9, 1, 7, 5, 12, 4, 12, 11, 5, 11, 2, 3, 5, 4, 7, 11, 1, 7, 3)
gender <- factor(c('Female', 'Female', 'Female', 'Female', 'Male', 'Male', 'Male', 'Female', 'Male', 'Female', 'Female', 'Male', 'Male', 'Male', 'Male', 'Male', 'Female', 'Male', 'Male', 'Male', 'Female', 'Male', 'Male', 'Male', 'Male', 'Female', 'Female', 'Female', 'Male', 'Male', 'Male', 'Male', 'Male', 'Male', 'Female', 'Male', 'Male', 'Female', 'Female', 'Male', 'Female', 'Male', 'Male', 'Male', 'Female', 'Female', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Male', 'Male', 'Male', 'Male', 'Male', 'Female', 'Male', 'Male', 'Female', 'Female', 'Male', 'Male', 'Male', 'Female', 'Male', 'Male', 'Male', 'Male', 'Male', 'Male', 'Male', 'Male', 'Female', 'Male', 'Female', 'Male', 'Male', 'Male', 'Male', 'Female', 'Male', 'Male', 'Male', 'Female', 'Male', 'Male', 'Male', 'Female', 'Male', 'Male', 'Female', 'Male', 'Female', 'Male', 'Male', 'Male', 'Male', 'Male', 'Male', 'Male', 'Female', 'Female', 'Male', 'Male', 'Male', 'Female', 'Male', 'Male', 'Male', 'Female', 'Male', 'Male', 'Male', 'Male', 'Male', 'Male', 'Male', 'Male', 'Male', 'Male', 'Male', 'Male', 'Male', 'Female', 'Male', 'Male', 'Male', 'Female', 'Male', 'Female', 'Female', 'Female', 'Male', 'Male', 'Male', 'Female', 'Male', 'Male', 'Male', 'Male', 'Female', 'Male', 'Male', 'Male', 'Male', 'Female', 'Male', 'Female', 'Male', 'Male', 'Male', 'Male', 'Male', 'Male', 'Female', 'Male', 'Male', 'Male', 'Male', 'Female', 'Male', 'Male', 'Male', 'Male', 'Male', 'Male', 'Female', 'Female', 'Male', 'Male', 'Male', 'Male', 'Male', 'Male', 'Female', 'Male', 'Male', 'Female', 'Female', 'Female', 'Female', 'Male', 'Male', 'Female', 'Male', 'Female', 'Male', 'Male', 'Female', 'Male', 'Female', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Male', 'Male', 'Male', 'Female', 'Male', 'Male', 'Male', 'Male', 'Male', 'Male', 'Male', 'Male', 'Male', 'Female', 'Male', 'Female', 'Male', 'Male', 'Male', 'Female', 'Male', 'Male', 'Male', 'Female', 'Male', 'Male', 'Male', 'Male', 'Male', 'Male', 'Female', 
'Male', 'Female', 'Male', 'Male', 'Male', 'Male', 'Male', 'Male', 'Male', 'Male', 'Male', 'Female', 'Female', 'Male', 'Male', 'Male', 'Male', 'Female', 'Female', 'Male'))
discipline <- c(0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0)
infraction <- c('academic dishonesty', 'disruptive conduct', 'failure to cooperate', 'failure to cooperate', 'alcohol', 'failure to cooperate', 'fighting', 'disruptive conduct', 'fighting', 'academic dishonesty', 'failure to cooperate', 'failure to cooperate', 'failure to cooperate', 'fighting', 'plagiarism', 'failure to cooperate', 'failure to cooperate', 'failure to cooperate', 'academic dishonesty', 'fighting', 'failure to cooperate', 'alcohol', 'fighting', 'failure to cooperate', 'failure to cooperate', 'alcohol', 'academic dishonesty', 'failure to cooperate', 'failure to cooperate', 'failure to cooperate', 'fighting', 'fighting', 'failure to cooperate', 'failure to cooperate', 'fighting', 'failure to cooperate', 'failure to cooperate', 'fighting', 'disruptive conduct', 'failure to cooperate', 'academic dishonesty', 'academic dishonesty', 'disruptive conduct', 'alcohol', 'failure to cooperate', 'disruptive conduct', 'academic dishonesty', 'failure to cooperate', 'failure to cooperate', 'fighting', 'failure to cooperate', 'failure to cooperate', 'academic dishonesty', 'fighting', 'vandalism', 'failure to cooperate', 'minor incident', 'fighting', 'failure to cooperate', 'failure to cooperate', 'failure to cooperate', 'failure to cooperate', 'fighting', 'academic dishonesty', 'disruptive conduct', 'plagiarism', 'academic dishonesty', 'fighting', 'failure to cooperate', 'minor incident', 'academic dishonesty', 'fighting', 'failure to cooperate', 'failure to cooperate', 'failure to cooperate', 'minor incident', 'failure to cooperate', 'failure to cooperate', 'academic dishonesty', 'fighting', 'fighting', 'failure to cooperate', 'failure to cooperate', 'failure to cooperate', 'failure to cooperate', 'fighting', 'fighting', 'failure to cooperate', 'failure to cooperate', 'failure to cooperate', 'alcohol', 'fighting', 'failure to cooperate', 'failure to cooperate', 'failure to cooperate', 'fighting', 'academic dishonesty', 'disruptive conduct', 'plagiarism', 
'disruptive conduct', 'alcohol', 'failure to cooperate', 'fighting', 'fighting', 'disruptive conduct', 'alcohol', 'academic dishonesty', 'failure to cooperate', 'alcohol', 'fighting', 'failure to cooperate', 'failure to cooperate', 'vandalism', 'failure to cooperate', 'failure to cooperate', 'academic dishonesty', 'failure to cooperate', 'disruptive conduct', 'failure to cooperate', 'failure to cooperate', 'alcohol', 'academic dishonesty')
infraction <- c(infraction, 'fighting', 'failure to cooperate', 'failure to cooperate', 'minor incident', 'failure to cooperate', 'disruptive conduct', 'alcohol', 'fighting', 'academic dishonesty', 'academic dishonesty', 'failure to cooperate', 'failure to cooperate', 'fighting', 'fighting', 'failure to cooperate', 'failure to cooperate', 'failure to cooperate', 'alcohol', 'academic dishonesty', 'vandalism', 'failure to cooperate', 'fighting', 'failure to cooperate', 'disruptive conduct', 'failure to cooperate', 'alcohol', 'fighting', 'disruptive conduct', 'fighting', 'disruptive conduct', 'academic dishonesty', 'disruptive conduct', 'failure to cooperate', 'failure to cooperate', 'failure to cooperate', 'failure to cooperate', 'disruptive conduct', 'fighting', 'fighting', 'failure to cooperate', 'failure to cooperate', 'academic dishonesty', 'fighting', 'fighting', 'disruptive conduct', 'academic dishonesty', 'alcohol', 'disruptive conduct', 'failure to cooperate', 'failure to cooperate', 'academic dishonesty', 'minor incident', 'failure to cooperate', 'academic dishonesty', 'failure to cooperate', 'fighting', 'fighting', 'vandalism', 'fighting', 'failure to cooperate', 'failure to cooperate', 'academic dishonesty', 'failure to cooperate', 'vandalism', 'fighting', 'minor incident', 'plagiarism', 'minor incident', 'academic dishonesty', 'failure to cooperate', 'failure to cooperate', 'alcohol', 'alcohol', 'academic dishonesty', 'alcohol', 'fighting', 'fighting', 'alcohol', 'failure to cooperate', 'minor incident', 'alcohol', 'fighting', 'failure to cooperate', 'academic dishonesty', 'fighting', 'failure to cooperate', 'disruptive conduct', 'failure to cooperate', 'disruptive conduct', 'failure to cooperate', 'failure to cooperate', 'failure to cooperate', 'academic dishonesty', 'failure to cooperate', 'failure to cooperate', 'academic dishonesty', 'failure to cooperate', 'fighting', 'academic dishonesty', 'failure to cooperate', 'failure to cooperate', 
'plagiarism', 'fighting', 'academic dishonesty', 'disruptive conduct', 'academic dishonesty', 'academic dishonesty', 'vandalism', 'failure to cooperate', 'fighting', 'failure to cooperate', 'failure to cooperate', 'failure to cooperate', 'fighting', 'failure to cooperate', 'failure to cooperate', 'alcohol', 'disruptive conduct', 'failure to cooperate', 'academic dishonesty', 'failure to cooperate', 'failure to cooperate', 'failure to cooperate', 'failure to cooperate', 'fighting', 'alcohol', 'academic dishonesty', 'failure to cooperate')
times <- c(1473938153, 1483110896, 1489060088, 1493899994, 1493909505, 1483614394, 1492518486, 1494502453, 1495209565, 1494590293, 1483704752, 1476705077, 1481545093, 1489416862, 1493390730, 1478088653, 1476705910, 1485259499, 1480431250, 1492614726, 1486539571, 1478700385, 1484581056, 1482755357, 1483446278, 1482247343, 1474383858, 1473682392, 1481283681, 1485173234, 1489503848, 1477492361, 1485951473, 1491562724, 1477914755, 1491835059, 1484295102, 1478011951, 1478592346, 1482493816, 1486468634, 1475075730, 1481633206, 1478765263, 1485173006, 1487245747, 1477395294, 1490617068, 1491828436, 1484238621, 1476085966, 1492778297, 1491910329, 1479739916, 1484653465, 1485788584, 1495206822, 1482762803, 1480604700, 1490701662, 1475679546, 1491923600, 1482503451, 1474974934, 1494851233, 1487058134, 1489389522, 1490191070, 1484743053, 1491922384, 1477492959, 1487602919, 1494850933, 1495717272, 1487330996, 1495726535, 1478531164, 1488290353, 1472818971, 1490193538, 1496153591, 1489582550, 1494339621, 1478530685, 1485432998, 1490715336, 1491234140, 1481122391, 1479987613, 1489577270, 1492183462, 1479223633, 1480577585, 1493810573, 1474978111, 1476282532, 1473941219, 1474360465, 1492516515, 1477999051, 1474470475, 1478780271, 1487256863, 1487079493, 1495538105, 1496156561, 1486468944, 1494246393, 1475072152, 1473090763, 1488381954, 1488540975, 1492688980, 1478012745, 1481643977, 1478085901, 1475582009, 1489578677, 1490357943, 1489579269, 1478186519, 1479470482, 1485959227, 1484049999, 1494329765, 1481296561, 1493900316, 1475654478, 1481207010, 1480430874, 1476445526, 1483949281, 1479731318, 1480592314, 1476369532, 1490193865, 1487334057, 1473941362, 1478090382, 1477666074, 1479471905, 1473421548, 1486555602, 1493304379, 1485346538, 1476952402, 1485428735, 1486566600, 1487253880, 1474554634, 1476707895, 1493987697, 1484567632, 1482305579, 1479974396, 1479369745, 1485933183, 1491392716, 1475582850, 1479393434, 1484061732, 1488541793, 1474460983, 1488540387, 1493908692, 
1475074857, 1486641048, 1494417673, 1491564900, 1481096843, 1487595204, 1493644827, 1476879918, 1480086060, 1493900260, 1490095752, 1496317517, 1478791768, 1493304308, 1485528923, 1483970062, 1473768259, 1490024346, 1489145829, 1494415660, 1483360369, 1481899428, 1490195704, 1474976116, 1478012482, 1487772608, 1495097185, 1493206536, 1484575342, 1478700512, 1479987287, 1492437830, 1483458545, 1480346620, 1493639689, 1485432736, 1477924233, 1491579799, 1479395782, 1476793778, 1493120617, 1476978304, 1496143032, 1482913710, 1489592461, 1479121859, 1478678802, 1486639883, 1485172303, 1486045483, 1485961556, 1473939375, 1493985609, 1475509067, 1485441623, 1491835910, 1491393930, 1480663866, 1479134060, 1492600360, 1485329189, 1494591915, 1476185444, 1493797262, 1489061537, 1482320494, 1484925313, 1476947699, 1480595285, 1485171879, 1479998225, 1485518119, 1481210950, 1496158904, 1478520382, 1487259327, 1476445306, 1486382276, 1485776699, 1475235680, 1493641377, 1477495455, 1487775512, 1492603708, 1479815502)
samps <- c(3230, 6560, 2134, 2228, 7145, 2773, 3865, 1280, 9692, 3935, 3348, 5202, 4754, 6900, 8771, 2893, 5992, 2698, 7739, 8273, 657, 8947, 7877, 3015, 5569, 8823, 6160, 5062, 4968, 4892, 6987, 6942, 1925, 4167, 3674, 6927, 615, 9339, 516, 1772, 4083, 6423, 5221, 180, 5675, 3234, 5159, 2711, 5296, 9419, 622, 2808, 1112, 6970, 1015, 6076, 8401, 8187, 8570, 1064, 7833, 6658, 8935, 3992, 4041, 514, 386, 8695, 5602, 8466, 7755, 8595, 1431, 2181, 1926, 9661, 6643, 6177, 4253, 8242, 8234, 4072, 6459, 6316, 5911, 7177, 9974, 6753, 4272, 3553, 9685, 8869, 79, 5951, 3457, 7257, 4197, 555, 2483, 5133, 9931, 4004, 8598, 9200, 1805, 7056, 2196, 1528, 9486, 8530, 6515, 5448, 1137, 6578, 6748, 2325, 1548, 2651, 4160, 4424, 8281, 5724, 8909, 1168, 1577, 9031, 2269, 901, 9070, 9844, 4209, 404, 1403, 1094, 9147, 9757, 2956, 5339, 1185, 9750, 5603, 4975, 2795, 8692, 1099, 953, 5106, 8901, 9682, 6566, 5083, 1410, 5445, 211, 59, 713, 372, 2340, 4455, 8101, 6866, 4681, 2171, 1912, 8065, 8017, 1148, 3494, 3519, 236, 2810, 5857, 2179, 9128, 5395, 2636, 3118, 8184, 7267, 6943, 9126, 2160, 6725, 4584, 2316, 5263, 7357, 8540, 4307, 7271, 7162, 604, 5606, 7120, 8517, 4287, 8846, 9942, 9689, 4727, 3980, 8891, 9787, 7568, 3747, 5447, 6912, 5325, 598, 9252, 2567, 771, 1464, 4619, 8193, 6237, 4755, 5327, 6115, 9663, 6015, 5404, 770, 8135, 2919, 881, 5706, 4504, 137, 1622, 5958, 7861, 708, 4564, 3478, 9445, 2601, 6719, 8867, 3861, 6371, 1227, 2279, 3868, 2570, 4833, 8836, 6781, 5043, 1824)
discipline_logs <- tibble::tibble(id, grade, discipline, gender,
infraction=factor(infraction),
timestamp=as.POSIXct(times, origin="1970-01-01", tz="UTC")
)
glimpse(discipline_logs)
## Observations: 250
## Variables: 6
## $ id <dbl> 3410, 9157, 2250, 2353, 4872, 2929, 4077, 1351, 9596, 41...
## $ grade <dbl> 5, 12, 3, 3, 6, 4, 5, 2, 12, 6, 5, 7, 7, 3, 11, 4, 9, 4,...
## $ discipline <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,...
## $ gender <fct> Female, Female, Female, Female, Male, Male, Male, Female...
## $ infraction <fct> academic dishonesty, disruptive conduct, failure to coop...
## $ timestamp <dttm> 2016-09-15 11:15:53, 2016-12-30 15:14:56, 2017-03-09 11...
# One-hot encode gender into 0/1 male and female indicator columns
discipline_logs <- discipline_logs %>%
mutate(male = ifelse(gender == "Male", 1, 0), female = ifelse(gender == "Female", 1, 0))
# Create a new column with the proper string encodings
# (content-driven binning: grade ranges map to school types; grades
# outside 1-12 would fall through every case_when() branch and become NA)
discipline_logs_new <- discipline_logs %>%
mutate(school_type = case_when(grade >= 1 & grade <= 5 ~ "elementary_school",
grade >= 6 & grade <= 8 ~ "middle_school",
grade <= 12 & grade >= 9 ~ "high_school"
)
)
# Look at a table of the new column
discipline_logs_new %>%
select(school_type) %>%
table()
## .
## elementary_school high_school middle_school
## 99 85 66
discipline_logs_new <- discipline_logs_new %>%
mutate(elem_sch = ifelse(school_type == "elementary_school", 1, 0),
mid_sch = ifelse(school_type == "middle_school", 1, 0),
high_sch = ifelse(school_type == "high_school", 1, 0)
)
# Create a table of the frequencies
discipline_table <- discipline_logs %>%
select(grade, discipline) %>%
table()
# Create a table of the proportions
prop_table <- prop.table(discipline_table, 1)
dgr_prop <- discipline_logs %>%
group_by(grade) %>%
summarize(proportion=mean(discipline))
dgr_prop
## # A tibble: 12 x 2
## grade proportion
## <dbl> <dbl>
## 1 1 0.0455
## 2 2 0
## 3 3 0.0952
## 4 4 0.190
## 5 5 0.0714
## 6 6 0.182
## 7 7 0.217
## 8 8 0.0476
## 9 9 0.6
## 10 10 0.818
## 11 11 0.826
## 12 12 0.64
# Combine the proportions and discipline logs data
discipline <- inner_join(discipline_logs, dgr_prop, by = "grade")
# Display a glimpse of the new data frame
glimpse(discipline)
## Observations: 250
## Variables: 9
## $ id <dbl> 3410, 9157, 2250, 2353, 4872, 2929, 4077, 1351, 9596, 41...
## $ grade <dbl> 5, 12, 3, 3, 6, 4, 5, 2, 12, 6, 5, 7, 7, 3, 11, 4, 9, 4,...
## $ discipline <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,...
## $ gender <fct> Female, Female, Female, Female, Male, Male, Male, Female...
## $ infraction <fct> academic dishonesty, disruptive conduct, failure to coop...
## $ timestamp <dttm> 2016-09-15 11:15:53, 2016-12-30 15:14:56, 2017-03-09 11...
## $ male <dbl> 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1,...
## $ female <dbl> 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0,...
## $ proportion <dbl> 0.0714, 0.6400, 0.0952, 0.0952, 0.1818, 0.1905, 0.0714, ...
# Create a new column with three levels using the proportions as ranges
discipline_ed <- discipline %>%
mutate(education_levels = case_when(proportion >= 0 & proportion <= .20 ~ "low_grade",
proportion > .20 & proportion <= .25 ~ "middle_grade",
proportion > .25 & proportion <= 1 ~ "high_grade"
)
)
glimpse(discipline_ed)
## Observations: 250
## Variables: 10
## $ id <dbl> 3410, 9157, 2250, 2353, 4872, 2929, 4077, 1351, 95...
## $ grade <dbl> 5, 12, 3, 3, 6, 4, 5, 2, 12, 6, 5, 7, 7, 3, 11, 4,...
## $ discipline <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,...
## $ gender <fct> Female, Female, Female, Female, Male, Male, Male, ...
## $ infraction <fct> academic dishonesty, disruptive conduct, failure t...
## $ timestamp <dttm> 2016-09-15 11:15:53, 2016-12-30 15:14:56, 2017-03...
## $ male <dbl> 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0,...
## $ female <dbl> 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,...
## $ proportion <dbl> 0.0714, 0.6400, 0.0952, 0.0952, 0.1818, 0.1905, 0....
## $ education_levels <chr> "low_grade", "high_grade", "low_grade", "low_grade...
Chapter 2 - Creating Features from Numeric Data
Numerical Bucketing or Binning:
Binning Numerical Data Using Quantiles:
Date and Time Feature Extraction:
Example code includes:
samps <- c(1115, 3501, 3729, 3314, 1269, 3992, 443, 643, 3509, 3795, 3020, 1011, 8, 696, 419, 2159, 228, 4319, 1154, 4927, 1477, 4127, 4413, 2767, 2156, 2242, 3810, 1430, 2695, 4422, 2168, 1348, 1642, 2315, 4576, 400, 4062, 4273, 848, 3838, 1162, 2721, 3860, 3873, 922, 2844, 1182, 2907, 1759, 184, 227, 3407, 3571, 4852, 3487, 4254, 2455, 430, 15, 3942, 4267, 2497, 45, 2213, 4114, 1807, 3107, 1095, 3391, 1889, 2642, 2246, 101, 4134, 3057, 3603, 2847, 3980, 3784, 195, 645, 4762, 1321, 2524, 2496, 1388, 12, 292, 2881, 3214, 216, 3982, 67, 2348, 1765, 1784, 1499, 2925, 4984, 648, 3773, 2711, 3945, 3780, 3995, 83, 801, 2684, 3604, 3271, 1101, 2264, 1487, 4541, 936, 4365, 3013, 2442, 3334, 2311, 4722, 2895, 4775, 3100, 3599, 1986, 2061, 3415, 4111, 2358, 3825, 535, 2539, 3738, 2736, 2478, 4646, 4452, 2601, 1790, 2429, 1246, 3849, 4072, 3480, 512, 2921, 4881, 3752, 2281, 1970, 382, 4368, 4855, 3001, 1010, 4492, 1433, 3806, 3238, 81, 1906, 2127, 389, 3581, 3940, 2620, 4565, 117, 4057, 2025, 4814, 364, 3794, 4545, 4399, 589, 2550, 2094, 802, 3665, 4540, 2409, 3774, 3679, 2073, 2212, 222, 970, 432, 3808, 4518, 4544, 3311, 3094, 2514, 2890, 3675, 3551, 1572, 4614, 4128, 2752, 1924, 3689, 4367, 77, 1345, 885, 974, 1241, 4272, 1068, 1096, 4233, 4819, 554, 4642, 780, 3863, 437, 1580, 935, 2649, 2481, 1869, 2380, 1169, 3809, 4900, 3931, 4437, 4213, 2657, 2146, 1191, 1896, 3984, 1252, 4094, 4670, 3036, 4948, 1220, 4137, 4721, 3939, 3983, 3432, 224)
Quantity <- c(2, 2, 3, 3, 20, 10, 3, 1, 6, 48, 1, 1, 1, 2, 6, 1, 3, 5, 1, 24, 1, 1, 4, 6, 12, 3, 1, 3, 6, 8, 2, 3, 3, 1, 1, 2, 2, 12, 4, 12, 16, 1, 1, 2, 2, 1, 12, 2, 6, 1, 31, 1, 1, 20, 3, 12, 3, 12, 1, 1, 6, 2, 12, 6, 4, 1, 2, 4, 12, 8, 6, 3, 1, 6, 6, 16, 1, 6, 1, 3, 24, 1, 3, 2, 1, 1, 3, 6, 1, 5, 48, 20, 36, 2, 9, 1, 12, 8, 5, 1, 1, 1, 12, 1, 12, 3, 2, 2, 6, 1, 1, 6, 24, 6, 1, 6, 3, 4, 10, 1, 3, 1, 1, 10, 2, 5, 6, 6, 1, 12, 12, 24, 3, 1, 2, 1, 10, 2, 3, 48, 6, 24, 12, 6, 1, 2, 1, 3, 1, 4, 2, 8, 6, 12, 2, 1, 1, 2, 24, 1, 6, 4, 1, 7, 8, 1, 3, 12, 2, 5, 36, 4, 1, 24, 4, 20, 2, 12, 3, 4, 1, 6, 2, 1, 1, 1, 24, 24, 2, 6, 3, 4, 12, 2, 12, 1, 12, 1, 2, 10, 2, 48, 2, 6, 6, 1, 48, 6, 1, 4, 48, 6, 24, 3, 1, 2, 1, 1, 8, 12, 16, 12, 3, 1, 12, 12, 6, 8, 24, 2, 2, 4, 2, 1, 2, 3, 1, 1, 2, 1, 24, 1, 2, 6, 4, 1, 3, 6, 12, 3)
# Wrap the Quantity vector in a one-column tibble for tidyverse workflows
online_retail <- tibble::tibble(Quantity)
glimpse(online_retail)
## Observations: 250
## Variables: 1
## $ Quantity <dbl> 2, 2, 3, 3, 20, 10, 3, 1, 6, 48, 1, 1, 1, 2, 6, 1, 3, 5, 1...
# Summarize the Quantity variable
online_retail %>%
  select(Quantity) %>%
  summary()
## Quantity
## Min. : 1.0
## 1st Qu.: 1.0
## Median : 3.0
## Mean : 6.9
## 3rd Qu.: 8.0
## Max. :48.0
# Create a bar chart of the observed variable values.
# geom_bar() counts each distinct value directly; the original
# geom_histogram(stat = "count") produced the warning
# "Ignoring unknown parameters: binwidth, bins, pad".
ggplot(online_retail, aes(x = Quantity)) +
  geom_bar()
# Create a sequence of numbers to capture the Quantity range
seq(1, 46, by=5)
## [1] 1 6 11 16 21 26 31 36 41 46
# Use the cut function to create a variable quant_cat.
# FIX: the original breaks = seq(1, 50, by = 5) (i.e. 1, 6, ..., 46) silently
# mapped Quantity == 1 (outside the left-open first interval (1,6]) and
# Quantity == 48 (above the last break, 46) to NA — the table below only
# accounts for 176 of the 250 rows. Extending the breaks to 51 and setting
# include.lowest = TRUE keeps every observed value (1..48) in a bucket.
online_retail <- online_retail %>%
  mutate(quant_cat = cut(Quantity,
                         breaks = seq(1, 51, by = 5),
                         include.lowest = TRUE))
# Create a table of the new column quant_cat
# (counts below were produced by the original breaks; re-knit to refresh —
# the first bucket becomes [1,6] and a (46,51] bucket appears)
online_retail %>%
  select(quant_cat) %>%
  table()
## .
## (1,6] (6,11] (11,16] (16,21] (21,26] (26,31] (31,36] (36,41] (41,46]
## 115 14 28 4 12 1 2 0 0
# Create dummy (one-hot) columns from the quant_cat factor;
# "~ quant_cat - 1" drops the intercept so every level gets an indicator
head(model.matrix(~ quant_cat - 1, data = online_retail))
## quant_cat(1,6] quant_cat(6,11] quant_cat(11,16] quant_cat(16,21]
## 1 1 0 0 0
## 2 1 0 0 0
## 3 1 0 0 0
## 4 1 0 0 0
## 5 0 0 0 1
## 6 0 1 0 0
## quant_cat(21,26] quant_cat(26,31] quant_cat(31,36] quant_cat(36,41]
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## quant_cat(41,46]
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
# Break the Quantity variable into 3 roughly equal-sized buckets
# (ntile assigns tertile membership 1/2/3 by rank)
online_retail <- online_retail %>%
  mutate(quant_q = ntile(Quantity, 3))
# Use table to look at the new variable
online_retail %>%
  select(quant_q) %>%
  table()
## .
## 1 2 3
## 84 83 83
# NOTE: a second, byte-identical select/table inspection (with identical
# output) was removed here — it duplicated the call above.
# Specify a full rank representation of the new column
# (quant_q enters as a single numeric column alongside the intercept)
head(model.matrix(~ quant_q, data = online_retail))
## (Intercept) quant_q
## 1 1 1
## 2 1 1
## 3 1 2
## 4 1 2
## 5 1 3
## 6 1 3
# Inspect the raw timestamp column (already a <dttm>)
glimpse(select(discipline_logs, timestamp))
## Observations: 250
## Variables: 1
## $ timestamp <dttm> 2016-09-15 11:15:53, 2016-12-30 15:14:56, 2017-03-09 11:...
# Parse the timestamp into an explicit date-time column
discipline_logs <- mutate(
  discipline_logs,
  timestamp_date = lubridate::ymd_hms(timestamp)
)
# Derive dow (day of the week) — label = TRUE yields an ordered factor
discipline_logs <- mutate(
  discipline_logs,
  dow = lubridate::wday(timestamp_date, label = TRUE)
)
head(discipline_logs)
## # A tibble: 6 x 10
## id grade discipline gender infraction timestamp male female
## <dbl> <dbl> <dbl> <fct> <fct> <dttm> <dbl> <dbl>
## 1 3410 5 0 Female academic ~ 2016-09-15 11:15:53 0 1
## 2 9157 12 0 Female disruptiv~ 2016-12-30 15:14:56 0 1
## 3 2250 3 0 Female failure t~ 2017-03-09 11:48:08 0 1
## 4 2353 3 0 Female failure t~ 2017-05-04 12:13:14 0 1
## 5 4872 6 1 Male alcohol 2017-05-04 14:51:45 1 0
## 6 2929 4 0 Male failure t~ 2017-01-05 11:06:34 1 0
## # ... with 2 more variables: timestamp_date <dttm>, dow <ord>
# Derive hod (hour of day) — an integer, per the <int> column type below
discipline_logs <- mutate(
  discipline_logs,
  hod = lubridate::hour(timestamp_date)
)
head(discipline_logs)
## # A tibble: 6 x 11
## id grade discipline gender infraction timestamp male female
## <dbl> <dbl> <dbl> <fct> <fct> <dttm> <dbl> <dbl>
## 1 3410 5 0 Female academic ~ 2016-09-15 11:15:53 0 1
## 2 9157 12 0 Female disruptiv~ 2016-12-30 15:14:56 0 1
## 3 2250 3 0 Female failure t~ 2017-03-09 11:48:08 0 1
## 4 2353 3 0 Female failure t~ 2017-05-04 12:13:14 0 1
## 5 4872 6 1 Male alcohol 2017-05-04 14:51:45 1 0
## 6 2929 4 0 Male failure t~ 2017-01-05 11:06:34 1 0
## # ... with 3 more variables: timestamp_date <dttm>, dow <ord>, hod <int>
# Create histogram of hod: one bar per hour. Setting binwidth = 1 matches
# the integer hour-of-day scale and suppresses the default-bins message
# ("`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.").
discipline_logs %>%
  ggplot(aes(x = hod)) +
  geom_histogram(binwidth = 1)
Chapter 3 - Transforming Numerical Features
Box-Cox and Yeo-Johnson Transformations:
Normalization Techniques:
Z-score Standardization:
Example code includes:
# Six parallel vectors of simulated Pokémon base stats (1,061 values each
# once the second c(<name>, ...) append below each vector is applied — see
# the glimpse() of poke_df in the next chunk). They are combined into
# poke_df <- data.frame(hp, attack, defense, spatk, spdef, speed) below.
# defense stat (built in two pieces to keep line lengths manageable)
defense <- c(49, 63, 83, 43, 58, 78, 65, 80, 100, 35, 55, 50, 30, 50, 40, 40, 55, 75, 35, 60, 30, 65, 44, 69, 40, 55, 85, 110, 52, 67, 87, 40, 57, 77, 48, 73, 40, 75, 20, 45, 35, 70, 55, 70, 85, 55, 80, 50, 60, 25, 50, 35, 60, 48, 78, 35, 60, 45, 80, 40, 65, 95, 15, 30, 45, 50, 70, 80, 35, 50, 65, 35, 65, 100, 115, 130, 55, 70, 65, 110, 70, 95, 55, 45, 70, 55, 80, 50, 75, 100, 180, 30, 45, 60, 160, 45, 70, 90, 115, 50, 70, 80, 85, 95, 110, 53, 79, 75, 95, 120, 95, 120, 5, 115, 80, 70, 95, 60, 65, 55, 85, 65, 80, 35, 57, 57, 100, 95, 55, 79, 80, 48, 50, 60, 60, 60, 70, 100, 125, 90, 105, 65, 65, 100, 85, 90, 45, 65, 95, 90, 100, 65, 80, 100, 43, 58, 78, 64, 80, 100, 34, 64, 30, 50, 30, 50, 40, 70, 80, 38, 58, 15, 28, 15, 65, 85, 45, 70, 40, 55, 85, 95, 50, 80, 115, 75, 40, 50, 70, 55, 30, 55, 45, 45, 85, 60, 110, 42, 80, 60, 48, 58, 65, 90, 140, 70, 105, 200, 50, 75, 85, 100, 230, 75, 55, 50, 75, 40, 120, 40, 80, 95, 35, 75, 45, 70, 140, 30, 50, 95, 60, 120, 90, 62, 35, 35, 95, 15, 37, 37, 105, 10, 75, 85, 115, 50, 70, 110, 130, 90, 100, 35, 45, 65, 40, 60, 70, 50, 70, 90, 35, 70, 41, 61, 35, 55, 50, 55, 70, 30, 50, 70, 50, 40, 60, 30, 60, 30, 100, 25, 35, 65, 32, 62, 60, 80, 60, 80, 100, 90, 45, 45, 23, 43, 63, 30, 60, 40, 135, 45, 65, 75, 85, 100, 140, 180, 55, 75, 40, 60, 40, 50, 75, 75, 45, 53, 83, 20, 40, 35, 45, 40, 70, 140, 35, 65, 60, 45, 50, 80, 40, 60, 60, 90, 60, 60, 65, 85, 43, 73, 65, 85, 55, 105, 77, 97, 50, 100, 20, 79, 70, 70, 35, 65, 90, 130, 83, 80, 60, 48, 50, 80, 50, 70, 90, 85, 105, 105, 130, 55, 60, 100, 80, 80, 100, 130, 200, 100, 150, 90, 80, 90, 140, 90, 100, 50, 64, 85, 105, 44, 52, 71, 53, 68, 88, 30, 50, 70, 40, 60, 41, 51, 34, 49, 79, 35, 65, 40, 60, 118, 168, 45, 85, 50, 42, 102, 70, 35, 55, 45, 70, 48, 68, 66, 34, 44, 44, 84, 60, 52, 42, 64, 50, 47, 67, 86, 116, 95, 45, 5, 45, 108, 45, 65, 95, 40, 40, 70, 78, 118, 90, 110, 40, 65, 72, 56, 76, 50, 50, 75, 65, 115, 95, 130, 125, 67, 67, 95, 86, 130, 110, 125, 80, 70, 65, 145, 135, 70, 
77, 130, 105, 70, 120, 100, 106, 110, 120, 120, 80, 100, 90, 100, 120, 100, 55, 75, 95, 45, 55, 65, 45, 60, 85, 39, 69, 45, 65, 90, 37, 50, 48, 63, 48, 63, 48, 63, 45, 85, 50, 62, 80, 32, 63, 85, 105, 130, 43, 55, 40, 60, 86, 55, 85, 95, 40, 55, 75, 85, 75, 70, 90, 80, 59, 99, 89, 60, 85, 50, 75, 65, 35, 45, 80, 45, 55, 67, 85, 125, 70, 115, 80, 85, 145, 103, 133, 45, 65, 62, 82, 40, 60, 40, 60, 50, 70, 95, 40, 50, 75, 50, 63, 50, 65, 85, 50, 70, 60, 45, 105)
defense <- c(defense, 45, 70, 50, 70, 80, 50, 60, 91, 131, 70, 95, 115, 40, 70, 80, 55, 75, 55, 60, 90, 60, 70, 90, 40, 80, 50, 85, 40, 84, 50, 60, 90, 50, 80, 70, 100, 95, 50, 75, 75, 105, 66, 112, 50, 70, 90, 55, 65, 129, 90, 72, 70, 70, 100, 120, 90, 90, 90, 77, 95, 65, 95, 122, 40, 58, 72, 40, 52, 67, 38, 77, 43, 55, 71, 40, 60, 50, 58, 72, 39, 47, 68, 48, 62, 62, 78, 60, 54, 76, 100, 150, 150, 60, 72, 66, 86, 53, 88, 67, 115, 60, 90, 62, 88, 33, 52, 77, 119, 50, 72, 65, 75, 57, 150, 35, 53, 70, 91, 48, 76, 70, 122, 85, 184, 35, 80, 95, 95, 121, 150, 60, 120, 55, 75, 75, 40, 50, 90, 54, 69, 74, 30, 50, 75, 30, 60, 45, 95, 90, 57, 77, 70, 40, 60, 40, 65, 20, 62, 152, 70, 100, 52, 92, 35, 90, 55, 80, 40, 60, 50, 80, 38, 48, 98, 90, 80, 90, 40, 140, 80, 110, 130, 95, 95, 100, 65, 135, 63, 80, 70, 85, 100, 65, 90, 125, 85, 75, 115, 115, 31, 131, 107, 89, 47, 139, 37, 71, 103, 131, 53, 101, 115, 80, 123, 111, 78, 120, 40, 80, 35, 70, 70, 40, 40, 40, 40, 40, 40, 50, 90, 120, 40, 75, 30, 60, 35, 60, 65, 100, 115, 130, 180, 50, 75, 80, 85, 110, 100, 120, 109, 85, 100, 70, 105, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 230, 140, 115, 90, 150, 75, 80, 110, 65, 125, 125, 230, 85, 80, 70, 100, 110, 70, 70, 70, 75, 60, 80, 130, 150, 120, 100, 90, 160, 100, 20, 160, 90, 45, 45, 105, 95, 70, 48, 68, 94, 115, 88, 105, 95, 107, 107, 107, 107, 107, 100, 75, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 126, 65, 105, 50, 50, 50, 70, 70, 70, 80, 70, 90, 90, 100, 90, 90, 95, 95, 95, 95, 67, 67, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 39, 39, 39, 39, 47, 47, 47, 47, 67, 68, 68, 68, 68, 60, 60, 60, 60, 60, 60, 60, 60, 60, 76, 50, 70, 70, 70, 122, 122, 122, 95, 71, 71, 121, 121, 110, 60, 60, 90, 70, 70, 70, 75, 130, 90, 60, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 100, 100, 100, 100, 100, 100, 60, 60, 60, 60, 60, 60, 
60, 60, 60, 60, 125, 115)
# speed stat
speed <- c(45, 60, 80, 65, 80, 100, 43, 58, 78, 45, 30, 70, 50, 35, 75, 56, 71, 101, 72, 97, 70, 100, 55, 80, 90, 110, 40, 65, 41, 56, 76, 50, 65, 85, 35, 60, 65, 100, 20, 45, 55, 90, 30, 40, 50, 25, 30, 45, 90, 95, 120, 90, 115, 55, 85, 70, 95, 60, 95, 90, 90, 70, 90, 105, 120, 35, 45, 55, 40, 55, 70, 70, 100, 20, 35, 45, 90, 105, 15, 30, 45, 70, 60, 75, 100, 45, 70, 25, 50, 40, 70, 80, 95, 110, 70, 42, 67, 50, 75, 100, 150, 40, 55, 35, 45, 87, 76, 30, 35, 60, 25, 40, 50, 60, 90, 60, 85, 63, 68, 85, 115, 90, 105, 95, 105, 93, 85, 110, 80, 81, 60, 48, 55, 65, 130, 65, 40, 35, 55, 55, 80, 130, 30, 85, 100, 90, 50, 70, 80, 130, 100, 45, 60, 80, 65, 80, 100, 43, 58, 78, 20, 90, 50, 70, 55, 85, 30, 40, 130, 67, 67, 60, 15, 15, 20, 40, 70, 95, 35, 45, 55, 50, 40, 50, 30, 70, 50, 80, 110, 85, 30, 30, 95, 15, 35, 110, 65, 91, 30, 85, 48, 33, 85, 15, 40, 45, 85, 30, 30, 45, 85, 65, 5, 85, 115, 40, 55, 20, 30, 50, 50, 35, 65, 45, 75, 70, 70, 65, 95, 85, 40, 50, 60, 85, 75, 35, 70, 65, 95, 83, 100, 55, 115, 100, 85, 41, 51, 61, 110, 90, 100, 70, 95, 120, 45, 55, 80, 40, 50, 60, 35, 70, 60, 100, 20, 15, 65, 15, 65, 30, 50, 70, 30, 60, 80, 85, 125, 85, 65, 40, 50, 80, 65, 80, 35, 70, 30, 90, 100, 40, 160, 40, 28, 48, 68, 25, 50, 20, 30, 50, 90, 50, 50, 30, 40, 50, 60, 80, 65, 105, 95, 95, 85, 85, 65, 40, 55, 65, 95, 60, 60, 35, 40, 20, 60, 80, 60, 10, 70, 100, 35, 55, 50, 80, 90, 65, 70, 70, 60, 60, 35, 55, 55, 75, 23, 43, 75, 45, 80, 81, 70, 40, 45, 65, 25, 25, 51, 65, 75, 23, 50, 80, 25, 45, 65, 32, 52, 52, 55, 97, 50, 50, 100, 30, 50, 70, 50, 50, 50, 110, 110, 90, 90, 95, 100, 150, 31, 36, 56, 61, 81, 108, 40, 50, 60, 60, 80, 100, 31, 71, 25, 65, 45, 60, 70, 55, 90, 58, 58, 30, 30, 36, 36, 66, 70, 40, 95, 85, 115, 35, 85, 34, 39, 115, 70, 80, 85, 105, 105, 71, 85, 112, 45, 74, 84, 23, 33, 10, 60, 30, 91, 35, 42, 82, 102, 5, 60, 90, 32, 47, 65, 95, 50, 85, 46, 66, 91, 50, 40, 60, 125, 60, 50, 40, 50, 95, 83, 80, 95, 95, 65, 95, 80, 90, 80, 40, 45, 110, 91, 95, 80, 115, 90, 
100, 77, 100, 90, 85, 80, 100, 125, 100, 120, 100, 63, 83, 113, 45, 55, 65, 45, 60, 70, 42, 77, 55, 60, 80, 66, 106, 64, 101, 64, 101, 64, 101, 24, 29, 43, 65, 93, 76, 116, 15, 20, 25, 72, 114, 68, 88, 50, 35, 40, 45, 64, 69, 74, 45, 85, 42, 42, 92, 57, 47, 112, 66, 116, 30, 90, 98, 65, 74, 92, 50, 95, 60, 55, 45, 48, 58, 97, 30, 30, 22, 32, 70, 110, 65, 75, 65, 105, 75, 115, 45, 55, 65, 20, 30, 30, 55, 98, 44, 59, 79, 75, 95, 103, 60, 20, 15, 30, 40, 60, 65)
speed <- c(speed, 65, 108, 10, 20, 30, 50, 90, 60, 40, 50, 30, 40, 20, 55, 80, 57, 67, 97, 40, 50, 105, 25, 145, 32, 65, 105, 48, 35, 55, 60, 70, 55, 60, 80, 60, 80, 65, 109, 38, 58, 98, 60, 100, 108, 108, 108, 111, 111, 90, 90, 101, 95, 108, 90, 99, 38, 57, 64, 60, 73, 104, 71, 97, 122, 57, 78, 62, 84, 126, 35, 29, 89, 72, 106, 42, 52, 75, 52, 68, 43, 58, 102, 68, 104, 28, 35, 60, 23, 29, 49, 72, 45, 73, 50, 68, 30, 44, 44, 59, 70, 109, 48, 71, 46, 58, 60, 118, 101, 50, 40, 60, 80, 75, 38, 56, 51, 84, 28, 28, 55, 123, 99, 99, 95, 50, 70, 70, 42, 52, 70, 70, 90, 60, 40, 50, 60, 65, 75, 60, 45, 45, 46, 36, 43, 63, 43, 93, 84, 124, 60, 112, 40, 45, 35, 45, 35, 27, 42, 35, 45, 15, 30, 77, 117, 50, 60, 32, 62, 72, 100, 60, 80, 80, 40, 15, 35, 5, 59, 95, 60, 65, 36, 96, 96, 92, 36, 40, 45, 65, 85, 130, 95, 75, 85, 37, 37, 97, 97, 103, 79, 151, 83, 61, 109, 43, 79, 65, 125, 80, 100, 100, 78, 145, 121, 72, 77, 77, 90, 90, 90, 90, 90, 90, 110, 40, 65, 65, 109, 90, 110, 90, 115, 150, 20, 35, 45, 30, 25, 50, 130, 45, 45, 100, 105, 81, 150, 130, 140, 45, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 30, 75, 75, 115, 71, 145, 100, 70, 100, 20, 50, 50, 100, 135, 105, 20, 80, 70, 70, 70, 75, 115, 100, 120, 110, 110, 110, 90, 90, 115, 150, 90, 180, 36, 36, 36, 36, 85, 34, 39, 135, 92, 112, 30, 110, 86, 86, 86, 86, 86, 90, 127, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 50, 98, 55, 75, 75, 75, 95, 95, 95, 121, 101, 91, 95, 95, 108, 128, 99, 99, 99, 99, 122, 132, 89, 89, 89, 89, 89, 89, 89, 89, 89, 89, 89, 89, 89, 89, 89, 89, 89, 89, 89, 42, 42, 42, 42, 52, 52, 52, 52, 92, 75, 75, 75, 75, 102, 102, 102, 102, 102, 102, 102, 102, 102, 104, 60, 56, 46, 41, 99, 69, 54, 99, 115, 115, 95, 85, 110, 80, 45, 43, 93, 93, 93, 82, 30, 45, 117, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 60, 60, 60, 60, 60, 60, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 85, 65)
# attack stat
attack <- c(49, 62, 82, 52, 64, 84, 48, 63, 83, 30, 20, 45, 35, 25, 90, 45, 60, 80, 56, 81, 60, 90, 60, 95, 55, 90, 75, 100, 47, 62, 92, 57, 72, 102, 45, 70, 41, 76, 45, 70, 45, 80, 50, 65, 80, 70, 95, 55, 65, 55, 100, 45, 70, 52, 82, 80, 105, 70, 110, 50, 65, 95, 20, 35, 50, 80, 100, 130, 75, 90, 105, 40, 70, 80, 95, 120, 85, 100, 65, 75, 35, 60, 90, 85, 110, 45, 70, 80, 105, 65, 95, 35, 50, 65, 45, 48, 73, 105, 130, 30, 50, 40, 95, 50, 80, 120, 105, 55, 65, 90, 85, 130, 5, 55, 95, 40, 65, 67, 92, 45, 75, 45, 110, 50, 83, 95, 125, 100, 10, 125, 85, 48, 55, 65, 65, 130, 60, 40, 60, 80, 115, 105, 110, 85, 90, 100, 64, 84, 134, 110, 100, 49, 62, 82, 52, 64, 84, 65, 80, 105, 46, 76, 30, 50, 20, 35, 60, 90, 90, 38, 58, 40, 25, 30, 20, 40, 50, 75, 40, 55, 75, 80, 20, 50, 100, 75, 35, 45, 55, 70, 30, 75, 65, 45, 85, 65, 65, 85, 75, 60, 72, 33, 80, 65, 90, 70, 75, 85, 80, 120, 95, 130, 10, 125, 95, 80, 130, 40, 50, 50, 100, 55, 65, 105, 55, 40, 80, 60, 90, 95, 60, 120, 80, 95, 20, 35, 95, 30, 63, 75, 80, 10, 85, 115, 75, 64, 84, 134, 90, 130, 100, 45, 65, 85, 60, 85, 120, 70, 85, 110, 55, 90, 30, 70, 45, 35, 70, 35, 50, 30, 50, 70, 40, 70, 100, 55, 85, 30, 50, 25, 35, 65, 30, 60, 40, 130, 60, 80, 160, 45, 90, 90, 51, 71, 91, 60, 120, 20, 45, 45, 65, 75, 85, 70, 90, 110, 40, 60, 45, 75, 50, 40, 73, 47, 60, 43, 73, 90, 120, 70, 90, 60, 100, 85, 25, 45, 60, 100, 70, 100, 85, 115, 40, 70, 115, 100, 55, 95, 48, 78, 80, 120, 40, 70, 41, 81, 95, 125, 15, 60, 70, 90, 75, 115, 40, 70, 68, 50, 130, 23, 50, 80, 40, 60, 80, 64, 104, 84, 90, 30, 75, 95, 135, 55, 75, 135, 100, 50, 75, 80, 90, 100, 150, 150, 100, 150, 68, 89, 109, 58, 78, 104, 51, 66, 86, 55, 75, 120, 45, 85, 25, 85, 65, 85, 120, 30, 70, 125, 165, 42, 52, 29, 59, 94, 30, 80, 45, 65, 105, 35, 60, 48, 83, 100, 50, 80, 66, 76, 60, 125, 55, 82, 30, 63, 93, 24, 89, 80, 25, 5, 65, 92, 70, 90, 130, 85, 70, 110, 72, 112, 50, 90, 61, 106, 100, 49, 69, 20, 62, 92, 120, 70, 85, 140, 100, 123, 95, 50, 76, 110, 60, 95, 130, 80, 125, 
55, 100, 80, 50, 75, 105, 125, 120, 120, 90, 160, 100, 70, 80, 100, 90, 100, 120, 100, 45, 60, 75, 63, 93, 123, 55, 75, 100, 55, 85, 60, 80, 110, 50, 88, 53, 98, 53, 98, 53, 98, 25, 55, 55, 77, 115, 60, 100, 75, 105, 135, 45, 57, 85, 135, 60, 80, 105, 140, 50, 65, 95, 100, 125, 53, 63, 103, 45, 55, 100, 27, 67, 35, 60, 92, 72, 82, 117, 90, 140, 86, 65, 105, 75, 90, 58, 30, 50, 78, 108, 112, 140, 50, 95, 65, 105, 50, 95, 30, 45, 55, 30, 40, 65, 44, 87, 50, 65, 95, 60, 100, 75, 75, 135, 55, 85, 40, 60, 75, 47, 77, 50, 94, 55, 80, 100, 55, 85, 115, 55, 75, 30, 40, 55, 87, 117)
attack <- c(attack, 147, 70, 130, 50, 40, 70, 66, 85, 125, 120, 74, 124, 85, 125, 110, 83, 123, 55, 65, 97, 109, 65, 85, 105, 85, 60, 90, 129, 90, 115, 115, 120, 150, 125, 130, 72, 77, 120, 61, 78, 107, 45, 59, 69, 56, 63, 95, 36, 56, 50, 73, 81, 35, 22, 52, 50, 68, 38, 45, 65, 65, 100, 82, 124, 80, 48, 48, 80, 110, 50, 52, 72, 48, 80, 54, 92, 52, 105, 60, 75, 53, 73, 38, 55, 89, 121, 59, 77, 65, 92, 58, 50, 50, 75, 100, 80, 70, 110, 66, 90, 69, 117, 30, 70, 131, 131, 100, 100, 110, 110, 55, 75, 107, 65, 85, 115, 54, 69, 74, 75, 85, 120, 70, 110, 62, 82, 70, 82, 132, 70, 45, 55, 65, 115, 20, 53, 63, 100, 125, 40, 70, 55, 105, 35, 45, 44, 64, 75, 125, 30, 40, 120, 52, 60, 120, 35, 125, 55, 75, 60, 95, 95, 60, 115, 78, 98, 90, 105, 60, 131, 55, 75, 110, 115, 85, 130, 75, 29, 29, 137, 113, 53, 139, 137, 89, 101, 181, 101, 107, 95, 125, 100, 130, 104, 103, 150, 80, 56, 71, 71, 55, 55, 55, 55, 55, 55, 85, 75, 100, 41, 67, 55, 100, 35, 60, 50, 80, 95, 120, 75, 80, 105, 65, 105, 80, 125, 155, 155, 135, 190, 150, 95, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 125, 150, 185, 90, 164, 110, 160, 150, 85, 85, 105, 140, 100, 75, 140, 120, 110, 70, 70, 70, 165, 150, 120, 145, 145, 100, 130, 150, 180, 180, 180, 70, 95, 29, 29, 79, 69, 60, 48, 83, 136, 170, 145, 132, 165, 65, 65, 65, 65, 65, 120, 103, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 60, 92, 30, 60, 60, 60, 100, 100, 100, 100, 105, 145, 120, 170, 72, 128, 120, 120, 120, 120, 95, 145, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 38, 38, 38, 38, 45, 45, 45, 45, 65, 65, 65, 65, 65, 80, 80, 80, 80, 80, 80, 80, 80, 80, 48, 150, 66, 66, 66, 85, 95, 100, 131, 100, 100, 100, 100, 160, 160, 110, 70, 70, 70, 70, 115, 140, 105, 64, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 60, 60, 60, 60, 60, 60, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 110, 95)
# hp stat
hp <- c(45, 60, 80, 39, 58, 78, 44, 59, 79, 45, 50, 60, 40, 45, 65, 40, 63, 83, 30, 55, 40, 65, 35, 60, 35, 60, 50, 75, 55, 70, 90, 46, 61, 81, 70, 95, 38, 73, 115, 140, 40, 75, 45, 60, 75, 35, 60, 60, 70, 10, 35, 40, 65, 50, 80, 40, 65, 55, 90, 40, 65, 90, 25, 40, 55, 70, 80, 90, 50, 65, 80, 40, 80, 40, 55, 80, 50, 65, 90, 95, 25, 50, 52, 35, 60, 65, 90, 80, 105, 30, 50, 30, 45, 60, 35, 60, 85, 30, 55, 40, 60, 60, 95, 50, 60, 50, 50, 90, 40, 65, 80, 105, 250, 65, 105, 30, 55, 45, 80, 30, 60, 40, 70, 65, 65, 65, 65, 75, 20, 95, 130, 48, 55, 130, 65, 65, 65, 35, 70, 30, 60, 80, 160, 90, 90, 90, 41, 61, 91, 106, 100, 45, 60, 80, 39, 58, 78, 50, 65, 85, 35, 85, 60, 100, 40, 55, 40, 70, 85, 75, 125, 20, 50, 90, 35, 55, 40, 65, 55, 70, 90, 75, 70, 100, 70, 90, 35, 55, 75, 55, 30, 75, 65, 55, 95, 65, 95, 60, 95, 60, 48, 190, 70, 50, 75, 100, 65, 75, 60, 90, 65, 70, 20, 80, 55, 60, 90, 40, 60, 50, 100, 65, 35, 75, 45, 85, 65, 45, 75, 75, 90, 90, 85, 73, 55, 35, 50, 45, 45, 45, 95, 255, 90, 115, 100, 50, 70, 100, 106, 106, 100, 40, 50, 70, 45, 60, 80, 50, 70, 100, 35, 70, 38, 78, 45, 50, 60, 50, 60, 40, 60, 80, 40, 70, 90, 40, 60, 40, 60, 28, 38, 68, 40, 70, 60, 60, 60, 80, 150, 31, 61, 1, 64, 84, 104, 72, 144, 50, 30, 50, 70, 50, 50, 50, 60, 70, 30, 60, 40, 70, 60, 60, 65, 65, 50, 70, 100, 45, 70, 130, 170, 60, 70, 70, 60, 80, 60, 45, 50, 80, 50, 70, 45, 75, 73, 73, 90, 90, 50, 110, 43, 63, 40, 60, 66, 86, 45, 75, 20, 95, 70, 60, 44, 64, 20, 40, 99, 75, 65, 95, 50, 80, 70, 90, 110, 35, 55, 55, 100, 43, 45, 65, 95, 40, 60, 80, 80, 80, 80, 80, 80, 100, 100, 105, 100, 50, 55, 75, 95, 44, 64, 76, 53, 64, 84, 40, 55, 85, 59, 79, 37, 77, 45, 60, 80, 40, 60, 67, 97, 30, 60, 40, 60, 70, 30, 70, 60, 55, 85, 45, 70, 76, 111, 75, 90, 150, 55, 65, 60, 100, 49, 71, 45, 63, 103, 57, 67, 50, 20, 100, 76, 50, 58, 68, 108, 135, 40, 70, 68, 108, 40, 70, 48, 83, 74, 49, 69, 45, 60, 90, 70, 70, 110, 115, 100, 75, 75, 85, 86, 65, 65, 75, 110, 85, 68, 60, 45, 70, 50, 75, 80, 75, 100, 90, 91, 
110, 150, 120, 80, 100, 70, 100, 120, 100, 45, 60, 75, 65, 90, 110, 55, 75, 95, 45, 60, 45, 65, 85, 41, 64, 50, 75, 50, 75, 50, 75, 76, 116, 50, 62, 80, 45, 75, 55, 70, 85, 65, 67, 60, 110, 103, 75, 85, 105, 50, 75, 105, 120, 75, 45, 55, 75, 30, 40, 60, 40, 60, 45, 70, 70, 50, 60, 95, 70, 105, 75, 50, 70, 50, 65, 72, 38, 58, 54, 74, 55, 75, 50, 80, 40, 60, 55, 75, 45, 60)
hp <- c(hp, 70, 45, 65, 110, 62, 75, 36, 51, 71, 60, 80, 55, 50, 70, 69, 114, 55, 100, 165, 50, 70, 44, 74, 40, 60, 60, 35, 65, 85, 55, 75, 50, 60, 60, 46, 66, 76, 55, 95, 80, 50, 80, 109, 45, 65, 77, 59, 89, 45, 65, 95, 70, 100, 70, 110, 85, 58, 52, 72, 92, 55, 85, 91, 91, 91, 79, 79, 100, 100, 89, 125, 91, 100, 71, 56, 61, 88, 40, 59, 75, 41, 54, 72, 38, 85, 45, 62, 78, 38, 45, 80, 62, 86, 44, 54, 78, 66, 123, 67, 95, 75, 62, 74, 45, 59, 60, 78, 101, 62, 82, 53, 86, 42, 72, 50, 65, 50, 71, 44, 62, 58, 82, 77, 123, 95, 78, 67, 50, 45, 68, 90, 57, 43, 85, 49, 65, 55, 95, 40, 85, 126, 126, 108, 50, 80, 80, 68, 78, 78, 45, 65, 95, 50, 60, 80, 35, 55, 80, 48, 88, 47, 57, 77, 47, 97, 75, 40, 60, 45, 75, 45, 50, 50, 70, 100, 38, 68, 40, 70, 40, 60, 48, 68, 70, 120, 42, 52, 72, 51, 90, 100, 25, 75, 55, 85, 55, 95, 95, 60, 65, 60, 65, 55, 68, 78, 70, 45, 55, 75, 70, 70, 70, 70, 43, 43, 137, 137, 109, 107, 71, 83, 97, 59, 223, 97, 80, 90, 80, 78, 78, 79, 65, 83, 30, 75, 75, 35, 35, 35, 35, 35, 35, 60, 50, 75, 38, 73, 10, 35, 40, 65, 55, 40, 55, 80, 95, 80, 105, 60, 95, 60, 105, 65, 95, 80, 106, 106, 90, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 75, 70, 80, 75, 100, 70, 80, 100, 68, 50, 50, 70, 60, 70, 70, 70, 75, 70, 70, 70, 64, 65, 80, 95, 80, 80, 80, 100, 100, 105, 50, 50, 50, 40, 40, 60, 60, 70, 76, 111, 65, 108, 70, 90, 68, 50, 50, 50, 50, 50, 150, 100, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 103, 70, 105, 60, 60, 60, 80, 80, 80, 79, 79, 89, 125, 125, 91, 100, 71, 71, 71, 71, 72, 72, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 44, 44, 44, 44, 54, 54, 54, 54, 74, 78, 78, 78, 78, 75, 75, 75, 75, 75, 75, 75, 75, 75, 74, 60, 44, 54, 59, 55, 75, 85, 126, 54, 54, 108, 216, 50, 80, 88, 77, 75, 75, 75, 85, 45, 70, 68, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 
60, 60, 60, 75, 80)
# special attack stat
spatk <- c(65, 80, 100, 60, 80, 109, 50, 65, 85, 20, 25, 90, 20, 25, 45, 35, 50, 70, 25, 50, 31, 61, 40, 65, 50, 90, 20, 45, 40, 55, 75, 40, 55, 85, 60, 95, 50, 81, 45, 85, 30, 65, 75, 85, 110, 45, 60, 40, 90, 35, 50, 40, 65, 65, 95, 35, 60, 70, 100, 40, 50, 70, 105, 120, 135, 35, 50, 65, 70, 85, 100, 50, 80, 30, 45, 55, 65, 80, 40, 100, 95, 120, 58, 35, 60, 45, 70, 40, 65, 45, 85, 100, 115, 130, 30, 43, 73, 25, 50, 55, 80, 60, 125, 40, 50, 35, 35, 60, 60, 85, 30, 45, 35, 100, 40, 70, 95, 35, 65, 70, 100, 100, 55, 115, 95, 100, 55, 40, 15, 60, 85, 48, 45, 110, 110, 95, 85, 90, 115, 55, 65, 60, 65, 95, 125, 125, 50, 70, 100, 154, 100, 49, 63, 83, 60, 80, 109, 44, 59, 79, 35, 45, 36, 86, 40, 55, 40, 60, 70, 56, 76, 35, 45, 40, 40, 80, 70, 95, 65, 80, 115, 90, 20, 60, 30, 90, 35, 45, 55, 40, 30, 105, 75, 25, 65, 130, 60, 85, 100, 85, 72, 33, 90, 35, 60, 65, 35, 55, 40, 60, 55, 55, 10, 40, 35, 50, 75, 70, 90, 30, 60, 65, 65, 105, 65, 80, 40, 80, 110, 95, 40, 60, 105, 85, 20, 35, 35, 85, 65, 70, 40, 75, 115, 90, 90, 45, 65, 95, 90, 110, 100, 65, 85, 105, 70, 85, 110, 50, 60, 85, 30, 60, 30, 50, 20, 25, 100, 25, 50, 40, 60, 90, 30, 60, 90, 30, 75, 55, 95, 45, 65, 125, 50, 100, 40, 60, 35, 55, 95, 30, 50, 30, 51, 71, 91, 20, 40, 20, 45, 35, 55, 65, 55, 40, 50, 60, 40, 60, 65, 105, 85, 75, 47, 73, 100, 43, 73, 65, 95, 70, 90, 65, 105, 85, 70, 90, 60, 45, 50, 80, 85, 115, 40, 70, 60, 100, 95, 55, 46, 76, 50, 90, 40, 70, 61, 81, 40, 70, 10, 100, 70, 60, 63, 83, 30, 60, 72, 95, 75, 23, 50, 80, 55, 75, 95, 74, 94, 114, 45, 40, 40, 60, 110, 35, 55, 95, 50, 100, 75, 110, 130, 150, 100, 150, 100, 150, 45, 55, 75, 58, 78, 104, 61, 81, 111, 30, 40, 50, 35, 55, 25, 55, 40, 60, 95, 50, 125, 30, 65, 42, 47, 29, 79, 94, 30, 80, 45, 60, 85, 62, 87, 57, 92, 60, 60, 90, 44, 54, 105, 105, 42, 64, 65, 41, 71, 24, 79, 10, 70, 15, 92, 92, 40, 50, 80, 40, 35, 115, 38, 68, 30, 60, 61, 86, 90, 49, 69, 60, 62, 92, 45, 130, 80, 55, 110, 95, 125, 120, 116, 60, 130, 45, 70, 135, 65, 75, 65, 80, 95, 
75, 105, 125, 150, 150, 130, 80, 100, 75, 80, 100, 135, 100, 120, 100, 45, 60, 75, 45, 70, 100, 63, 83, 108, 35, 60, 25, 35, 45, 50, 88, 53, 98, 53, 98, 53, 98, 67, 107, 36, 50, 65, 50, 80, 25, 50, 60, 55, 77, 30, 50, 60, 25, 40, 55, 50, 65, 85, 30, 30, 40, 50, 70, 30, 40, 55, 37, 77, 70, 110, 80, 35, 45, 65, 15, 30, 106, 35, 65, 35, 45, 103, 55, 95, 53, 83, 74, 112, 40, 60, 80, 120, 40, 65, 55, 75, 95, 105, 125, 125, 44, 87, 65, 80, 110, 40, 60, 75, 40, 60, 55, 85, 65, 85, 40, 57, 97, 24, 54, 45, 70, 70, 45, 75, 105, 85, 125, 65, 95, 145, 30, 40, 60, 60, 70, 95, 40)
spatk <- c(spatk, 100, 81, 55, 95, 60, 35, 55, 40, 60, 40, 37, 57, 45, 55, 105, 48, 45, 65, 125, 50, 135, 90, 72, 90, 125, 125, 150, 120, 115, 130, 129, 128, 120, 48, 56, 74, 62, 90, 114, 62, 83, 103, 32, 50, 40, 56, 74, 27, 27, 90, 73, 109, 61, 75, 112, 62, 97, 46, 69, 65, 63, 83, 35, 45, 50, 63, 99, 59, 85, 37, 68, 39, 54, 60, 97, 58, 120, 61, 109, 45, 69, 67, 99, 110, 74, 81, 50, 55, 83, 110, 80, 50, 65, 44, 58, 32, 44, 45, 97, 131, 131, 81, 100, 150, 130, 50, 70, 100, 60, 80, 80, 66, 91, 126, 30, 40, 75, 30, 55, 55, 55, 145, 42, 62, 98, 55, 95, 30, 55, 25, 43, 53, 45, 55, 40, 50, 50, 80, 65, 90, 71, 111, 45, 55, 30, 40, 50, 82, 90, 40, 20, 60, 70, 100, 30, 95, 95, 60, 75, 91, 40, 50, 70, 135, 86, 45, 65, 100, 95, 130, 85, 95, 29, 29, 113, 137, 127, 53, 137, 173, 107, 59, 97, 127, 130, 90, 122, 130, 159, 135, 15, 135, 25, 40, 40, 50, 50, 50, 50, 50, 50, 95, 10, 25, 50, 81, 35, 50, 50, 75, 175, 30, 45, 55, 130, 40, 65, 170, 125, 50, 60, 65, 70, 70, 154, 194, 165, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 55, 65, 40, 140, 95, 145, 130, 95, 165, 85, 55, 60, 80, 135, 110, 145, 110, 70, 70, 70, 93, 115, 120, 120, 105, 140, 160, 180, 150, 180, 180, 70, 95, 29, 29, 59, 69, 87, 57, 92, 54, 120, 140, 132, 65, 105, 105, 105, 105, 105, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 80, 80, 140, 40, 40, 40, 60, 60, 60, 110, 145, 105, 170, 120, 129, 77, 120, 120, 120, 120, 103, 153, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 61, 61, 61, 61, 75, 75, 75, 75, 125, 112, 112, 112, 112, 65, 65, 65, 65, 65, 65, 65, 65, 65, 83, 150, 44, 44, 44, 58, 58, 58, 131, 61, 61, 81, 91, 160, 170, 55, 145, 98, 98, 98, 55, 140, 80, 111, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 60, 60, 60, 60, 60, 60, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 130)
# special defense stat
spdef <- c(65, 80, 100, 50, 65, 85, 64, 80, 105, 20, 25, 80, 20, 25, 80, 35, 50, 70, 35, 70, 31, 61, 54, 79, 50, 80, 30, 55, 40, 55, 85, 40, 55, 75, 65, 90, 65, 100, 25, 50, 40, 75, 65, 75, 90, 55, 80, 55, 75, 45, 70, 40, 65, 50, 80, 45, 70, 50, 80, 40, 50, 90, 55, 70, 95, 35, 60, 85, 30, 45, 70, 100, 120, 30, 45, 65, 65, 80, 40, 80, 55, 70, 62, 35, 60, 70, 95, 50, 100, 25, 45, 35, 55, 75, 45, 90, 115, 25, 50, 55, 80, 45, 75, 50, 80, 110, 110, 75, 45, 70, 30, 45, 105, 40, 80, 25, 45, 50, 80, 55, 85, 120, 80, 95, 85, 85, 70, 70, 20, 100, 95, 48, 65, 95, 95, 110, 75, 55, 70, 45, 70, 75, 110, 125, 90, 85, 50, 70, 100, 90, 100, 65, 80, 100, 50, 65, 85, 48, 63, 83, 45, 55, 56, 96, 80, 110, 40, 70, 80, 56, 76, 35, 55, 20, 65, 105, 45, 70, 45, 60, 90, 100, 50, 80, 65, 100, 55, 65, 95, 55, 30, 85, 45, 25, 65, 95, 130, 42, 110, 85, 48, 58, 65, 35, 60, 65, 65, 65, 40, 60, 55, 80, 230, 95, 75, 50, 75, 40, 80, 30, 60, 95, 35, 75, 45, 140, 70, 50, 80, 95, 40, 60, 95, 65, 45, 35, 110, 65, 55, 55, 70, 135, 100, 75, 115, 50, 70, 100, 154, 154, 100, 55, 65, 85, 50, 60, 70, 50, 70, 90, 30, 60, 41, 61, 30, 25, 50, 25, 90, 50, 70, 100, 30, 40, 60, 30, 50, 30, 70, 35, 55, 115, 52, 82, 60, 60, 35, 55, 65, 30, 50, 30, 23, 43, 73, 30, 60, 40, 90, 35, 55, 65, 55, 40, 50, 60, 55, 75, 40, 60, 75, 85, 85, 85, 80, 53, 83, 20, 40, 35, 45, 45, 75, 70, 80, 110, 60, 45, 50, 80, 40, 60, 75, 105, 60, 60, 85, 65, 41, 71, 35, 55, 70, 120, 87, 107, 50, 80, 55, 125, 70, 120, 33, 63, 90, 130, 87, 90, 60, 48, 50, 80, 50, 70, 90, 55, 75, 75, 65, 65, 30, 50, 80, 60, 80, 90, 100, 200, 150, 130, 110, 140, 90, 90, 100, 50, 55, 65, 85, 44, 52, 71, 56, 76, 101, 30, 40, 60, 40, 60, 41, 51, 34, 49, 79, 70, 105, 30, 50, 88, 138, 45, 105, 50, 42, 102, 90, 30, 50, 53, 78, 62, 82, 66, 44, 54, 56, 96, 105, 52, 37, 59, 50, 41, 61, 86, 116, 45, 90, 65, 42, 108, 45, 55, 85, 85, 40, 70, 42, 72, 55, 75, 40, 65, 72, 61, 86, 120, 60, 85, 85, 90, 95, 55, 50, 85, 95, 115, 56, 65, 95, 75, 60, 75, 115, 150, 135, 70, 77, 130, 105, 
70, 100, 120, 106, 110, 120, 130, 80, 100, 90, 100, 120, 100, 55, 75, 95, 45, 55, 65, 45, 60, 70, 39, 69, 45, 65, 90, 37, 50, 48, 63, 48, 63, 48, 63, 55, 95, 30, 42, 55, 32, 63, 25, 40, 80, 43, 55, 45, 65, 86, 35, 50, 65, 40, 55, 75, 85, 75, 60, 80, 80, 39, 79, 69, 50, 75, 50, 75, 55, 35, 45, 70, 45, 55, 67, 35, 75, 70, 115, 80, 65, 105, 45, 65, 45, 65, 62, 82, 40, 60, 40, 60, 65, 85, 110, 50, 60, 85, 50, 63, 60, 75, 95, 50, 70, 60, 45, 105, 55, 80, 85, 105, 45, 50, 60, 86, 116, 60, 85, 85, 40, 70, 80, 55, 95, 55, 60, 90, 40, 50, 70, 40, 80, 135, 65, 60, 99, 50, 60)
spdef <- c(spdef, 90, 50, 80, 40, 70, 95, 50, 75, 65, 95, 66, 48, 50, 70, 90, 55, 105, 72, 90, 129, 80, 80, 120, 100, 80, 90, 90, 128, 95, 45, 58, 75, 60, 70, 100, 44, 56, 71, 36, 77, 38, 52, 69, 25, 30, 50, 54, 66, 79, 98, 154, 57, 81, 48, 71, 90, 60, 81, 37, 49, 150, 65, 89, 57, 75, 46, 75, 56, 86, 60, 123, 63, 89, 43, 94, 45, 59, 63, 92, 130, 63, 67, 150, 75, 113, 150, 87, 60, 82, 55, 75, 35, 46, 40, 80, 98, 98, 95, 150, 130, 90, 50, 70, 100, 40, 50, 90, 56, 81, 116, 30, 50, 75, 30, 60, 45, 75, 75, 47, 67, 70, 40, 70, 40, 65, 25, 52, 142, 55, 85, 72, 132, 35, 90, 75, 100, 40, 60, 50, 60, 38, 48, 98, 110, 110, 60, 30, 90, 45, 75, 130, 95, 95, 100, 95, 85, 73, 105, 70, 91, 90, 45, 70, 105, 75, 115, 95, 130, 31, 131, 89, 107, 131, 53, 37, 71, 101, 31, 53, 89, 115, 90, 120, 85, 115, 115, 80, 80, 35, 80, 80, 50, 50, 50, 50, 50, 50, 85, 35, 65, 65, 100, 45, 70, 40, 65, 95, 30, 45, 65, 80, 50, 100, 95, 75, 80, 100, 90, 130, 95, 100, 120, 110, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 95, 100, 105, 90, 120, 85, 80, 110, 135, 115, 95, 80, 85, 80, 65, 105, 105, 70, 70, 70, 83, 60, 80, 90, 110, 150, 120, 160, 90, 100, 20, 160, 90, 45, 45, 85, 95, 78, 62, 82, 96, 95, 70, 105, 115, 107, 107, 107, 107, 107, 100, 75, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 126, 55, 105, 50, 50, 50, 70, 70, 70, 90, 80, 80, 100, 90, 90, 77, 95, 95, 95, 95, 71, 71, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 79, 79, 79, 79, 98, 98, 98, 98, 128, 154, 154, 154, 154, 90, 90, 90, 90, 90, 90, 90, 90, 90, 81, 50, 55, 55, 55, 75, 75, 75, 98, 85, 85, 95, 95, 110, 130, 60, 75, 70, 70, 70, 75, 135, 90, 60, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 100, 100, 100, 100, 100, 100, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 105, 115)
# Assemble the six stat vectors into a single data frame
poke_df <- data.frame(hp, attack, defense, spatk, spdef, speed)
glimpse(poke_df)
## Observations: 1,061
## Variables: 6
## $ hp <dbl> 45, 60, 80, 39, 58, 78, 44, 59, 79, 45, 50, 60, 40, 45, 65,...
## $ attack <dbl> 49, 62, 82, 52, 64, 84, 48, 63, 83, 30, 20, 45, 35, 25, 90,...
## $ defense <dbl> 49, 63, 83, 43, 58, 78, 65, 80, 100, 35, 55, 50, 30, 50, 40...
## $ spatk <dbl> 65, 80, 100, 60, 80, 109, 50, 65, 85, 20, 25, 90, 20, 25, 4...
## $ spdef <dbl> 65, 80, 100, 50, 65, 85, 64, 80, 105, 20, 25, 80, 20, 25, 8...
## $ speed <dbl> 45, 60, 80, 65, 80, 100, 43, 58, 78, 45, 30, 70, 50, 35, 75...
library(caret)
# Pick the two skewed features to transform
poke_vars <- select(poke_df, defense, speed)
# Estimate Box-Cox lambdas for the selected columns
# (NOTE(review): Box-Cox is only defined for strictly positive values —
# confirm both columns stay positive if the data changes)
processed_vars <- preProcess(poke_vars, method = c("BoxCox"))
# Apply the fitted transformation; predict() only touches the
# columns the preProcess object was trained on
poke_df <- predict(processed_vars, poke_df)
# Inspect the transformed distributions
ggplot(poke_df, aes(x = defense)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(poke_df, aes(x = speed)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
duration <- c(261, 151, 76, 92, 198, 139, 217, 380, 50, 55, 222, 137, 517, 71, 174, 353, 98, 38, 219, 54, 262, 164, 160, 342, 181, 172, 296, 127, 255, 348, 225, 230, 208, 226, 336, 242, 365, 1666, 577, 137, 160, 180, 22, 1492, 616, 242, 355, 225, 160, 363, 266, 253, 179, 787, 145, 174, 104, 13, 185, 1778, 138, 812, 164, 391, 357, 91, 528, 273, 158, 177, 258, 172, 154, 291, 181, 176, 211, 349, 272, 208, 193, 212, 20, 1042, 246, 529, 1467, 1389, 188, 180, 48, 213, 583, 221, 173, 426, 287, 101, 203, 197, 257, 124, 229, 55, 400, 197, 190, 21, 514, 849, 194, 144, 212, 286, 107, 247, 518, 364, 178, 98, 439, 79, 120, 127, 175, 262, 61, 78, 143, 579, 677, 345, 185, 100, 125, 193, 136, 73, 528, 541, 163, 301, 46, 204, 98, 71, 157, 243, 186, 579, 163, 610, 2033, 85, 114, 114, 57, 238, 93, 128, 107, 181, 303, 558, 270, 228, 99, 240, 673, 233, 1056, 250, 252, 138, 130, 412, 179, 19, 458, 717, 313, 683, 1077, 416, 146, 167, 315, 140, 346, 562, 172, 217, 142, 67, 291, 309, 248, 98, 256, 82, 577, 286, 477, 611, 471, 381, 42, 251, 408, 215, 287, 216, 366, 210, 288, 168, 338, 410, 177, 127, 357, 175, 300, 136, 1419, 125, 213, 27, 238, 124, 18, 730, 746, 121, 247, 40, 181, 79, 206, 389, 127, 702, 151, 117, 232, 408, 179, 39, 282, 714)
balance <- c(2143, 29, 2, 1506, 1, 231, 447, 2, 121, 593, 270, 390, 6, 71, 162, 229, 13, 52, 60, 0, 723, 779, 23, 50, 0, -372, 255, 113, -246, 265, 839, 378, 39, 0, 10635, 63, -7, -3, 506, 0, 2586, 49, 104, 529, 96, -171, -364, 0, 0, 0, 1291, -244, 0, -76, -103, 243, 424, 306, 24, 179, 0, 989, 249, 790, 154, 6530, 100, 59, 1205, 12223, 5935, 25, 282, 23, 1937, 384, 582, 91, 0, 1, 206, 164, 690, 2343, 137, 173, 45, 1270, 16, 486, 50, 152, 290, 54, -37, 101, 383, 81, 0, 229, -674, 90, 128, 179, 0, 54, 151, 61, 30, 523, 31, 79, -34, 448, 81, 144, 351, -67, 262, 0, 56, 26, 3, 41, 7, 105, 818, -16, 0, 2476, 1185, 217, 1685, 802, 0, 94, 0, 0, 517, 265, 947, 3, 42, 37, 57, 22, 8, 293, 3, 348, -19, 0, -4, 18, 139, 0, 1883, 216, 782, 904, 1705, 47, 176, 1225, 86, 82, 271, 1378, 184, 0, 0, 1357, 19, 434, 92, 1151, 41, 51, 214, 1161, 37, 787, 59, 253, 211, 235, 4384, 4080, 53, 0, 2127, 377, 73, 445, 243, 307, 155, 173, 400, 1428, 219, 7, 575, 298, 0, 5699, 176, 517, 257, 56, -390, 330, 195, 301, -41, 483, 28, 13, 965, 378, 219, 324, -69, 0, 205, 278, 1065, 34, 1033, 1467, -12, 388, 294, 1827, 627, 25, 315, 0, 66, -9, 349, 100, 0, 434, 3237, 275, 0, 207, 483, 2248)
# Combine the two bank-marketing vectors (defined above) into a data frame
bank_df <- data.frame(balance, duration)
glimpse(bank_df)
## Observations: 250
## Variables: 2
## $ balance <dbl> 2143, 29, 2, 1506, 1, 231, 447, 2, 121, 593, 270, 390, 6, ...
## $ duration <dbl> 261, 151, 76, 92, 198, 139, 217, 380, 50, 55, 222, 137, 51...
# Select both variables
bank_vars <- bank_df %>%
select(balance, duration)
# Perform a Yeo-Johnson transformation — unlike Box-Cox it accepts zero and
# negative values, which balance contains
processed_vars <- preProcess(bank_vars, method = c("YeoJohnson"))
# Use predict to apply the fitted transformation to the data
bank_df <- predict(processed_vars, bank_df)
# Plot transformed features as density curves
ggplot(bank_df, aes(x=balance)) +
geom_density()
ggplot(bank_df, aes(x=duration)) +
geom_density()
# Min-max scale hp onto [0, 1] as a new feature, scaled_hp.
# diff(range(hp)) is max(hp) - min(hp).
poke_df <- poke_df %>%
  mutate(scaled_hp = (hp - min(hp)) / diff(range(hp)))
# Compare the raw and rescaled columns side by side
summary(select(poke_df, hp, scaled_hp))
## hp scaled_hp
## Min. : 1 Min. :0.000
## 1st Qu.: 50 1st Qu.:0.193
## Median : 68 Median :0.264
## Mean : 70 Mean :0.272
## 3rd Qu.: 80 3rd Qu.:0.311
## Max. :255 Max. :1.000
# Use mutate to create column attack_mc: attack mean-centered by hand
poke_df <- poke_df %>%
mutate(attack_mc = attack - mean(attack))
# Select variables to mean-center via caret instead
poke_vars <- poke_df %>%
select(attack, spatk, spdef)
# Use preProcess to estimate the centering (means of each column)
processed_vars <- preProcess(poke_vars, method=c("center"))
# Use predict to include the transformed variables in the data
# (the three selected columns are overwritten in place)
poke_df <- predict(processed_vars, poke_df)
# Summarize the three new column scales — each mean should now be 0
poke_df %>%
select(attack, spatk, spdef) %>%
summary()
## attack spatk spdef
## Min. :-74.6 Min. :-64.6 Min. :-52.9
## 1st Qu.:-24.6 1st Qu.:-24.6 1st Qu.:-22.9
## Median : -4.6 Median : -4.6 Median : -2.9
## Mean : 0.0 Mean : 0.0 Mean : 0.0
## 3rd Qu.: 20.4 3rd Qu.: 20.4 3rd Qu.: 17.1
## Max. :110.4 Max. :119.4 Max. :157.1
# Standardize speed into a z-score column (mean 0, sd 1).
# Base scale() centers by the mean and divides by the sample sd,
# exactly (speed - mean(speed)) / sd(speed); as.vector() drops the matrix class.
poke_df <- poke_df %>%
  mutate(z_speed = as.vector(scale(speed)))
# Inspect the original and standardized columns together
summary(select(poke_df, speed, z_speed))
## speed z_speed
## Min. : 3.0 Min. :-2.77
## 1st Qu.:20.0 1st Qu.:-0.72
## Median :26.0 Median :-0.01
## Mean :26.1 Mean : 0.00
## 3rd Qu.:32.7 3rd Qu.: 0.79
## Max. :52.7 Max. : 3.20
# Select variables to standardize with caret
poke_vars <- poke_df %>%
select(attack, defense, spatk, spdef)
# Create preProcess variable list — "center" then "scale" together is
# z-score standardization
processed_vars <- preProcess(poke_vars, method = c("center", "scale"))
# Use predict to assign standardized variables (columns overwritten in place)
poke_df <- predict(processed_vars, poke_df)
# Summarize new variables — all means 0, values in sd units
poke_df %>%
select(attack, defense, spatk, spdef) %>%
summary()
## attack defense spatk spdef
## Min. :-2.38 Min. :-4.39 Min. :-2.02 Min. :-1.89
## 1st Qu.:-0.78 1st Qu.:-0.76 1st Qu.:-0.77 1st Qu.:-0.82
## Median :-0.15 Median : 0.02 Median :-0.14 Median :-0.10
## Mean : 0.00 Mean : 0.00 Mean : 0.00 Mean : 0.00
## 3rd Qu.: 0.65 3rd Qu.: 0.68 3rd Qu.: 0.64 3rd Qu.: 0.61
## Max. : 3.52 Max. : 3.47 Max. : 3.74 Max. : 5.61
Chapter 4 - Advanced Methods
Feature Crossing:
Principal Component Analysis:
Interpreting PCA Output:
Wrap Up:
Example code includes:
# Copy over first 250 records
gender <- stringr::str_trim(c(c(' Male', ' Male', ' Male', ' Male', ' Female', ' Female', ' Female', ' Male', ' Female', ' Male', ' Male', ' Male', ' Female', ' Male', ' Male', ' Male', ' Male', ' Male', ' Male', ' Female', ' Male', ' Female', ' Male', ' Male', ' Female', ' Male', ' Male', ' Male', ' Male', ' Male', ' Male', ' Male', ' Male', ' Male', ' Male', ' Male', ' Male', ' Female', ' Male', ' Male', ' Male', ' Male', ' Male', ' Female', ' Male', ' Male', ' Male', ' Female', ' Male', ' Male', ' Female', ' Female', ' Female', ' Male', ' Male', ' Male', ' Male', ' Male', ' Male', ' Male', ' Male', ' Male', ' Male', ' Male', ' Male', ' Male', ' Female', ' Female', ' Male', ' Male', ' Male', ' Female', ' Male', ' Male', ' Male', ' Male', ' Male', ' Male', ' Female', ' Male', ' Male', ' Male', ' Female', ' Male', ' Female', ' Female', ' Male', ' Male', ' Male', ' Female', ' Male', ' Female', ' Female', ' Female', ' Male', ' Male', ' Male', ' Male', ' Female', ' Male', ' Male', ' Male', ' Female', ' Female', ' Male', ' Male', ' Female', ' Male', ' Female', ' Male', ' Male', ' Male', ' Male', ' Female', ' Female', ' Male', ' Male', ' Male', ' Male', ' Male', ' Female', ' Male', ' Male', ' Male', ' Male', ' Female', ' Female', ' Male', ' Male', ' Male', ' Female', ' Female', ' Male', ' Male', ' Female', ' Male', ' Male', ' Female', ' Male', ' Male', ' Male', ' Female', ' Male', ' Female', ' Male', ' Female', ' Male', ' Female', ' Male', ' Female', ' Female', ' Female', ' Female', ' Male', ' Male', ' Male', ' Male', ' Male', ' Male', ' Male', ' Female', ' Female', ' Female', ' Female', ' Male', ' Female', ' Male', ' Male', ' Male', ' Female', ' Male', ' Male', ' Male', ' Male', ' Male', ' Female', ' Male', ' Male', ' Male', ' Male', ' Male', ' Female', ' Male', ' Male', ' Male', ' Male', ' Female', ' Female', ' Female', ' Male', ' Female', ' Male', ' Male', ' Male', ' Male', ' Male', ' Male', ' Female', ' Male', ' Male', ' Female', ' Male', ' Male', ' Female', ' 
Female', ' Female'), c(' Male', ' Male', ' Female', ' Female', ' Male', ' Male', ' Female', ' Male', ' Male', ' Male', ' Female', ' Male', ' Male', ' Male', ' Male', ' Male', ' Male', ' Male', ' Female', ' Female', ' Female', ' Male', ' Female', ' Female', ' Male', ' Female', ' Male', ' Female', ' Male', ' Male', ' Female', ' Male', ' Male', ' Male', ' Male', ' Female', ' Male', ' Male', ' Male', ' Male', ' Male', ' Female', ' Male', ' Male')))
occ1 <- c(' Adm-clerical', ' Exec-managerial', ' Handlers-cleaners', ' Handlers-cleaners', ' Prof-specialty', ' Exec-managerial', ' Other-service', ' Exec-managerial', ' Prof-specialty', ' Exec-managerial', ' Exec-managerial', ' Prof-specialty', ' Adm-clerical', ' Sales', ' Craft-repair', ' Transport-moving', ' Farming-fishing', ' Machine-op-inspct', ' Sales', ' Exec-managerial', ' Prof-specialty', ' Other-service', ' Farming-fishing', ' Transport-moving', ' Tech-support', ' Tech-support', ' Craft-repair', 'unknown', ' Exec-managerial', ' Craft-repair', ' Protective-serv', ' Sales', ' Exec-managerial', ' Adm-clerical', ' Other-service', ' Machine-op-inspct', ' Machine-op-inspct', ' Adm-clerical', ' Sales', ' Prof-specialty', ' Machine-op-inspct', ' Prof-specialty', ' Tech-support', ' Adm-clerical', ' Handlers-cleaners', ' Prof-specialty', ' Machine-op-inspct', ' Exec-managerial', ' Craft-repair', ' Prof-specialty', ' Exec-managerial', ' Other-service', ' Prof-specialty', ' Exec-managerial', ' Exec-managerial', ' Tech-support', ' Machine-op-inspct', ' Other-service', ' Adm-clerical', ' Machine-op-inspct', ' Sales', 'unknown', ' Transport-moving', ' Prof-specialty', ' Tech-support', ' Craft-repair', ' Adm-clerical', ' Adm-clerical', ' Exec-managerial', 'unknown', ' Prof-specialty', ' Sales', ' Sales', ' Machine-op-inspct', ' Prof-specialty', ' Other-service', ' Adm-clerical', 'unknown', ' Other-service', ' Farming-fishing', ' Sales', ' Other-service', ' Other-service', ' Sales', ' Craft-repair', ' Sales', ' Protective-serv', ' Prof-specialty', ' Sales', ' Prof-specialty', ' Prof-specialty', ' Craft-repair', ' Machine-op-inspct', ' Sales', ' Protective-serv', ' Handlers-cleaners', ' Prof-specialty', ' Sales', ' Exec-managerial', ' Other-service', ' Exec-managerial')
occ2 <- c(' Exec-managerial', ' Prof-specialty', ' Tech-support', ' Craft-repair', ' Craft-repair', 'unknown', ' Handlers-cleaners', ' Adm-clerical', ' Handlers-cleaners', ' Sales', ' Prof-specialty', ' Other-service', ' Sales', ' Machine-op-inspct', ' Handlers-cleaners', ' Sales', ' Craft-repair', ' Sales', ' Craft-repair', ' Other-service', ' Exec-managerial', ' Exec-managerial', ' Prof-specialty', ' Other-service', ' Exec-managerial', ' Adm-clerical', ' Adm-clerical', 'unknown', ' Craft-repair', ' Sales', ' Other-service', ' Craft-repair', ' Sales', ' Tech-support', ' Prof-specialty', ' Craft-repair', ' Adm-clerical', ' Sales', ' Craft-repair', ' Craft-repair', ' Sales', ' Other-service', ' Prof-specialty', ' Tech-support', ' Transport-moving', ' Other-service', ' Other-service', ' Craft-repair', 'unknown', ' Adm-clerical', ' Adm-clerical', ' Exec-managerial', ' Craft-repair', 'unknown', ' Craft-repair', ' Handlers-cleaners', ' Sales', ' Craft-repair', ' Other-service', 'unknown', ' Other-service', ' Exec-managerial', ' Exec-managerial', ' Sales', ' Other-service', ' Exec-managerial', ' Protective-serv', ' Handlers-cleaners', ' Prof-specialty', ' Other-service', ' Protective-serv', ' Sales', ' Craft-repair', ' Prof-specialty', ' Sales', ' Craft-repair', ' Handlers-cleaners', ' Other-service', ' Prof-specialty', ' Exec-managerial', ' Adm-clerical', ' Craft-repair', ' Machine-op-inspct', ' Adm-clerical', ' Adm-clerical', ' Exec-managerial')
occ3 <- c( 'unknown', ' Prof-specialty', ' Prof-specialty', ' Machine-op-inspct', ' Machine-op-inspct', ' Craft-repair', ' Tech-support', ' Tech-support', ' Transport-moving', ' Craft-repair', ' Exec-managerial', ' Prof-specialty', ' Sales', ' Prof-specialty', 'unknown', ' Exec-managerial', ' Prof-specialty', ' Adm-clerical', ' Sales', ' Other-service', ' Craft-repair', ' Sales', ' Sales', ' Transport-moving', ' Craft-repair', ' Sales', ' Craft-repair', ' Machine-op-inspct', ' Exec-managerial', ' Sales', ' Sales', ' Prof-specialty', ' Craft-repair', ' Handlers-cleaners', 'unknown', ' Other-service', ' Adm-clerical', ' Machine-op-inspct', ' Sales', 'unknown', ' Farming-fishing', ' Adm-clerical', ' Adm-clerical', ' Transport-moving', ' Sales', ' Adm-clerical', ' Craft-repair', ' Prof-specialty', ' Other-service', ' Adm-clerical', ' Exec-managerial', ' Exec-managerial', ' Transport-moving', ' Prof-specialty', ' Other-service', ' Protective-serv', 'unknown', ' Craft-repair', ' Adm-clerical', ' Adm-clerical', ' Other-service', ' Tech-support', ' Adm-clerical')
# Concatenate the three occupation vectors and strip the leading spaces the
# raw census extract carries
occupation <- stringr::str_trim(c(occ1, occ2, occ3))
# Keep both columns as character (not factor) for the tidy tools below
adult_incomes <- data.frame(gender, occupation, stringsAsFactors = FALSE)
glimpse(adult_incomes)
## Observations: 250
## Variables: 2
## $ gender <chr> "Male", "Male", "Male", "Male", "Female", "Female", "Fem...
## $ occupation <chr> "Adm-clerical", "Exec-managerial", "Handlers-cleaners", ...
# Group the data and create a summary of the counts per (occupation, gender)
adult_incomes %>%
count(occupation, gender) %>%
# Create a grouped bar graph; `.` pipes the counted data into ggplot
ggplot(., aes(x=occupation, y=n, fill=gender)) +
geom_bar(stat="identity", position="dodge") +
# Rotate x labels so the long occupation names remain readable
theme(axis.text.x = element_text(angle = 90, hjust = 1))
# Create a contingency table of the same two variables
adult_incomes %>%
select(gender, occupation) %>%
table()
## occupation
## gender Adm-clerical Craft-repair Exec-managerial Farming-fishing
## Female 15 3 11 0
## Male 12 28 19 4
## occupation
## gender Handlers-cleaners Machine-op-inspct Other-service Prof-specialty
## Female 0 4 12 10
## Male 11 11 15 22
## occupation
## gender Protective-serv Sales Tech-support Transport-moving unknown
## Female 0 14 3 1 5
## Male 6 20 8 7 9
# Create a feature cross between gender and occupation: the `:` interaction
# term makes dummyVars build one indicator column per (gender, occupation) pair
dmy <- dummyVars(~ gender:occupation, data=adult_incomes)
# Apply the encoding; predict() returns a numeric matrix of 0/1 indicators
oh_data <- predict(dmy, adult_incomes)
# Summarize the resulting output — each column mean is that pair's frequency
summary(oh_data)
## genderFemale:occupationAdm-clerical genderMale:occupationAdm-clerical
## Min. :0.00 Min. :0.000
## 1st Qu.:0.00 1st Qu.:0.000
## Median :0.00 Median :0.000
## Mean :0.06 Mean :0.048
## 3rd Qu.:0.00 3rd Qu.:0.000
## Max. :1.00 Max. :1.000
## genderFemale:occupationCraft-repair genderMale:occupationCraft-repair
## Min. :0.000 Min. :0.000
## 1st Qu.:0.000 1st Qu.:0.000
## Median :0.000 Median :0.000
## Mean :0.012 Mean :0.112
## 3rd Qu.:0.000 3rd Qu.:0.000
## Max. :1.000 Max. :1.000
## genderFemale:occupationExec-managerial genderMale:occupationExec-managerial
## Min. :0.000 Min. :0.000
## 1st Qu.:0.000 1st Qu.:0.000
## Median :0.000 Median :0.000
## Mean :0.044 Mean :0.076
## 3rd Qu.:0.000 3rd Qu.:0.000
## Max. :1.000 Max. :1.000
## genderFemale:occupationFarming-fishing genderMale:occupationFarming-fishing
## Min. :0 Min. :0.000
## 1st Qu.:0 1st Qu.:0.000
## Median :0 Median :0.000
## Mean :0 Mean :0.016
## 3rd Qu.:0 3rd Qu.:0.000
## Max. :0 Max. :1.000
## genderFemale:occupationHandlers-cleaners
## Min. :0
## 1st Qu.:0
## Median :0
## Mean :0
## 3rd Qu.:0
## Max. :0
## genderMale:occupationHandlers-cleaners
## Min. :0.000
## 1st Qu.:0.000
## Median :0.000
## Mean :0.044
## 3rd Qu.:0.000
## Max. :1.000
## genderFemale:occupationMachine-op-inspct
## Min. :0.000
## 1st Qu.:0.000
## Median :0.000
## Mean :0.016
## 3rd Qu.:0.000
## Max. :1.000
## genderMale:occupationMachine-op-inspct genderFemale:occupationOther-service
## Min. :0.000 Min. :0.000
## 1st Qu.:0.000 1st Qu.:0.000
## Median :0.000 Median :0.000
## Mean :0.044 Mean :0.048
## 3rd Qu.:0.000 3rd Qu.:0.000
## Max. :1.000 Max. :1.000
## genderMale:occupationOther-service genderFemale:occupationProf-specialty
## Min. :0.00 Min. :0.00
## 1st Qu.:0.00 1st Qu.:0.00
## Median :0.00 Median :0.00
## Mean :0.06 Mean :0.04
## 3rd Qu.:0.00 3rd Qu.:0.00
## Max. :1.00 Max. :1.00
## genderMale:occupationProf-specialty genderFemale:occupationProtective-serv
## Min. :0.000 Min. :0
## 1st Qu.:0.000 1st Qu.:0
## Median :0.000 Median :0
## Mean :0.088 Mean :0
## 3rd Qu.:0.000 3rd Qu.:0
## Max. :1.000 Max. :0
## genderMale:occupationProtective-serv genderFemale:occupationSales
## Min. :0.000 Min. :0.000
## 1st Qu.:0.000 1st Qu.:0.000
## Median :0.000 Median :0.000
## Mean :0.024 Mean :0.056
## 3rd Qu.:0.000 3rd Qu.:0.000
## Max. :1.000 Max. :1.000
## genderMale:occupationSales genderFemale:occupationTech-support
## Min. :0.00 Min. :0.000
## 1st Qu.:0.00 1st Qu.:0.000
## Median :0.00 Median :0.000
## Mean :0.08 Mean :0.012
## 3rd Qu.:0.00 3rd Qu.:0.000
## Max. :1.00 Max. :1.000
## genderMale:occupationTech-support genderFemale:occupationTransport-moving
## Min. :0.000 Min. :0.000
## 1st Qu.:0.000 1st Qu.:0.000
## Median :0.000 Median :0.000
## Mean :0.032 Mean :0.004
## 3rd Qu.:0.000 3rd Qu.:0.000
## Max. :1.000 Max. :1.000
## genderMale:occupationTransport-moving genderFemale:occupationunknown
## Min. :0.000 Min. :0.00
## 1st Qu.:0.000 1st Qu.:0.00
## Median :0.000 Median :0.00
## Mean :0.028 Mean :0.02
## 3rd Qu.:0.000 3rd Qu.:0.00
## Max. :1.000 Max. :1.00
## genderMale:occupationunknown
## Min. :0.000
## 1st Qu.:0.000
## Median :0.000
## Mean :0.036
## 3rd Qu.:0.000
## Max. :1.000
# Create the df of the six numeric stats to feed into PCA
poke_x <- poke_df %>%
select(hp, attack, defense, spatk, spdef, speed)
# Perform PCA. NOTE(review): the full argument name is `scale.`; `scale=TRUE`
# works via partial matching but spelling it out would be safer.
poke_pca <- prcomp(poke_x, center=TRUE, scale=TRUE)
# Calculate the proportion of variance explained by each component:
# variance = sdev^2, proportion = variance / total variance
prop_var <- data.frame(sdev=poke_pca$sdev)
prop_var <- prop_var %>%
mutate(pca_comp = 1:n(), pcVar = sdev^2, propVar_ex = pcVar/sum(pcVar))
# Create a scree plot of the components and proportion of variance
ggplot(prop_var, aes(pca_comp, propVar_ex, group=1)) +
geom_line() +
geom_point()
# Create a plot of the first two components
library(ggfortify)
## Registered S3 methods overwritten by 'ggfortify':
## method from
## autoplot.Arima forecast
## autoplot.acf forecast
## autoplot.ar forecast
## autoplot.bats forecast
## autoplot.decomposed.ts forecast
## autoplot.ets forecast
## autoplot.forecast forecast
## autoplot.stl forecast
## autoplot.ts forecast
## fitted.ar forecast
## fortify.ts forecast
## residuals.ar forecast
autoplot(poke_pca, data = poke_df)
Chapter 1 - Wrangling Text
Text as Data:
Counting Categorical Data:
Tokenizing and Cleaning:
Example code includes:
# Load the airline-tweet dataset (7,044 tweets) used for the rest of the notes
twitter_data <- readRDS("./RInputFiles/ch_1_twitter_data.rds")
glimpse(twitter_data)
## Observations: 7,044
## Variables: 6
## $ tweet_id <dbl> 4.77e+17, 4.77e+17, 4.77e+17, 4.77e+17, 4.77e+1...
## $ date <dttm> 2014-06-12 00:07:25, 2014-06-12 00:12:30, 2014...
## $ complaint_label <chr> "Non-Complaint", "Non-Complaint", "Complaint", ...
## $ tweet_text <chr> "1. Haneda. 2. I'll go to a corner st in Tokyo ...
## $ usr_followers_count <dbl> 152, 184, 136, 1, 67, 138, 21, 133, 607, 165, 8...
## $ usr_verified <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE...
# Print twitter_data (tibble printing shows the first 10 rows)
twitter_data
## # A tibble: 7,044 x 6
## tweet_id date complaint_label tweet_text usr_followers_c~
## <dbl> <dttm> <chr> <chr> <dbl>
## 1 4.77e17 2014-06-12 00:07:25 Non-Complaint "1. Haned~ 152
## 2 4.77e17 2014-06-12 00:12:30 Non-Complaint "My plane~ 184
## 3 4.77e17 2014-06-12 00:13:56 Complaint "So appar~ 136
## 4 4.77e17 2014-06-12 00:16:09 Non-Complaint "Je suppo~ 1
## 5 4.77e17 2014-06-12 00:17:37 Non-Complaint "Dear @Ce~ 67
## 6 4.77e17 2014-06-12 00:18:49 Complaint "Boo @Del~ 138
## 7 4.77e17 2014-06-12 00:26:42 Non-Complaint "#PALFlie~ 21
## 8 4.77e17 2014-06-12 00:31:08 Complaint "@JetBlue~ 133
## 9 4.77e17 2014-06-12 00:35:27 Non-Complaint "Celebrat~ 607
## 10 4.77e17 2014-06-12 00:46:47 Non-Complaint "Don't do~ 165
## # ... with 7,034 more rows, and 1 more variable: usr_verified <lgl>
# Print just the complaints in twitter_data (1,676 of 7,044 tweets)
twitter_data %>%
filter(complaint_label == "Complaint")
## # A tibble: 1,676 x 6
## tweet_id date complaint_label tweet_text usr_followers_c~
## <dbl> <dttm> <chr> <chr> <dbl>
## 1 4.77e17 2014-06-12 00:13:56 Complaint "So appar~ 136
## 2 4.77e17 2014-06-12 00:18:49 Complaint "Boo @Del~ 138
## 3 4.77e17 2014-06-12 00:31:08 Complaint "@JetBlue~ 133
## 4 4.77e17 2014-06-12 00:49:18 Complaint "@TheReal~ 221
## 5 4.77e17 2014-06-12 00:54:32 Complaint "@America~ 10
## 6 4.77e17 2014-06-12 00:58:36 Complaint "I strong~ 158
## 7 4.77e17 2014-06-12 01:08:40 Complaint "@donclif~ 55
## 8 4.77e17 2014-06-12 01:27:36 Complaint "@USAirwa~ 995
## 9 4.77e17 2014-06-12 02:17:21 Complaint "Just ask~ 7005
## 10 4.77e17 2014-06-12 02:18:16 Complaint "@migs647~ 919
## # ... with 1,666 more rows, and 1 more variable: usr_verified <lgl>
# Start with the data frame
twitter_data %>%
# Group by whether or not the tweet is a complaint
group_by(complaint_label) %>%
# Compute the mean, min, and max follower counts
summarize(avg_followers = mean(usr_followers_count),
min_followers = min(usr_followers_count),
max_followers = max(usr_followers_count)
)
## # A tibble: 2 x 4
## complaint_label avg_followers min_followers max_followers
## <chr> <dbl> <dbl> <dbl>
## 1 Complaint 3234. 0 1259803
## 2 Non-Complaint 4487. 0 2200851
twitter_data %>%
# Filter for just the complaints
filter(complaint_label == "Complaint") %>%
# Count the number of verified and non-verified users
count(usr_verified)
## # A tibble: 2 x 2
## usr_verified n
## <lgl> <int>
## 1 FALSE 1650
## 2 TRUE 26
twitter_data %>%
# Group by whether or not a user is verified
group_by(usr_verified) %>%
summarize(
# Compute the average number of followers
avg_followers = mean(usr_followers_count),
# Count the number of users in each category
n = n()
)
## # A tibble: 2 x 3
## usr_verified avg_followers n
## <lgl> <dbl> <int>
## 1 FALSE 1999. 6927
## 2 TRUE 133849. 117
tidy_twitter <- twitter_data %>%
# Tokenize the twitter data: one row per word per tweet
# (unnest_tokens lowercases and strips punctuation by default — verify version)
tidytext::unnest_tokens(word, tweet_text)
tidy_twitter %>%
# Compute word counts
count(word) %>%
# Arrange the counts in descending order
arrange(desc(n))
## # A tibble: 18,600 x 2
## word n
## <chr> <int>
## 1 to 2834
## 2 the 2212
## 3 a 1989
## 4 i 1752
## 5 t.co 1405
## 6 http 1361
## 7 for 1356
## 8 you 1345
## 9 on 1289
## 10 and 1153
## # ... with 18,590 more rows
# Rebuild tidy_twitter, this time dropping common stop words
tidy_twitter <- twitter_data %>%
# Tokenize the twitter data
tidytext::unnest_tokens(word, tweet_text) %>%
# Remove stop words via an anti-join on the word column
anti_join(tidytext::stop_words)
## Joining, by = "word"
tidy_twitter %>%
# Filter to keep complaints only
filter(complaint_label == "Complaint") %>%
# Compute word counts and arrange in descending order
count(word) %>%
arrange(desc(n))
## # A tibble: 3,863 x 2
## word n
## <chr> <int>
## 1 flight 459
## 2 united 362
## 3 americanair 294
## 4 usairways 207
## 5 time 167
## 6 delta 141
## 7 service 137
## 8 2 129
## 9 delayed 123
## 10 british_airways 121
## # ... with 3,853 more rows
Chapter 2 - Visualizing Text
Plotting Word Counts:
Improving Word Count Plots:
Faceting Word Count Plots:
Plotting Word Clouds:
Example code includes:
# Count complaint words and keep only the frequent ones for plotting
word_counts <- tidy_twitter %>%
filter(complaint_label == "Complaint") %>%
count(word) %>%
# Keep words with count greater than 100
filter(n > 100)
# Create a bar plot using word_counts (bars unordered — see fct_reorder below)
ggplot(word_counts, aes(x=word, y=n)) +
geom_col() +
# Flip the plot coordinates so word labels read horizontally
coord_flip()
word_counts <- tidy_twitter %>%
# Only keep the non-complaints
filter(complaint_label == "Non-Complaint") %>%
count(word) %>%
filter(n > 150)
# Create a bar plot using the new word_counts
ggplot(word_counts, aes(x=word, y=n)) +
geom_col() +
coord_flip() +
# Title the plot "Non-Complaint Word Counts"
ggtitle("Non-Complaint Word Counts")
custom_stop_words <- tribble(
# Column names should match stop_words
~word, ~lexicon,
# Add http, win, and t.co as custom stop words
"http", "CUSTOM",
"win", "CUSTOM",
"t.co", "CUSTOM"
)
# Bind the custom stop words to stop_words.
# NOTE(review): stop_words2 is built but never applied to tidy_twitter below —
# the later plots still contain "win" and "t.co". Likely an oversight to fix.
stop_words2 <- tidytext::stop_words %>%
bind_rows(custom_stop_words)
word_counts <- tidy_twitter %>%
filter(complaint_label == "Non-Complaint") %>%
count(word) %>%
# Keep terms that occur more than 100 times
filter(n > 100) %>%
# Reorder word as an ordered factor by word counts, so bars sort by frequency
mutate(word2 = fct_reorder(word, n))
# Plot the new word column with type factor
ggplot(word_counts, aes(x=word2, y=n)) +
geom_col() +
coord_flip() +
ggtitle("Non-Complaint Word Counts")
word_counts <- tidy_twitter %>%
# Count words by whether or not it's a complaint
count(word, complaint_label) %>%
# Group by whether or not it's a complaint
group_by(complaint_label) %>%
# Keep the top 20 words per group
top_n(20, n) %>%
# Ungroup before reordering word as a factor by the count
ungroup() %>%
mutate(word2 = fct_reorder(word, n))
# Include a color aesthetic tied to whether or not it's a complaint
ggplot(word_counts, aes(x = word2, y = n, fill = complaint_label)) +
# Don't include the legend for the column plot
geom_col(show.legend = FALSE) +
# Facet by whether or not it's a complaint and make the y-axis free
facet_wrap(~ complaint_label, scales = "free_y") +
# Flip the coordinates and add a title: "Twitter Word Counts"
coord_flip() +
ggtitle("Twitter Word Counts")
# Compute word counts and assign to word_counts
word_counts <- tidy_twitter %>%
count(word)
# Word cloud of the 30 most frequent terms across all tweets
wordcloud::wordcloud(
# Assign the word column to words
words = word_counts$word,
# Assign the count column to freq
freq = word_counts$n,
max.words = 30
)
# Compute complaint word counts and assign to word_counts
word_counts <- tidy_twitter %>%
filter(complaint_label=="Complaint") %>%
count(word)
# Create a complaint word cloud of the top 50 terms, colored red
wordcloud::wordcloud(
words = word_counts$word,
freq = word_counts$n,
max.words = 50,
colors = "red"
)
Chapter 3 - Sentiment Analysis
Sentiment Dictionaries:
Appending Dictionaries:
Improving Sentiment Analysis:
Example code includes:
# Count the number of words associated with each sentiment in bing.
# NOTE(review): the original comment said "nrc", but the code loads the
# two-level (positive/negative) bing dictionary.
tidytext::get_sentiments("bing") %>%
count(sentiment) %>%
# Arrange the counts in descending order
arrange(-n)
## # A tibble: 2 x 2
## sentiment n
## <chr> <int>
## 1 negative 4781
## 2 positive 2005
# Pull in the bing dictionary, count the sentiments and reorder them by count.
# (The comment and labels originally said NRC, but the code uses bing.)
sentiment_counts <- tidytext::get_sentiments("bing") %>%
count(sentiment) %>%
mutate(sentiment2 = fct_reorder(sentiment, n))
# Visualize sentiment_counts using the new sentiment factor column
ggplot(sentiment_counts, aes(x=sentiment2, y=n)) +
geom_col() +
coord_flip() +
# Label to match the dictionary actually used. The x aesthetic is the
# sentiment factor and y the counts; coord_flip() only swaps the display,
# so labs() must name the aesthetics, not the on-screen axes.
labs(title = "Sentiment Counts in Bing", x = "Sentiment", y = "Counts")
# Join tidy_twitter and the bing sentiment dictionary
# (inner_join keeps only words that appear in the dictionary)
sentiment_twitter <- tidy_twitter %>%
inner_join(tidytext::get_sentiments("bing"))
## Joining, by = "word"
# Count the sentiments in tidy_twitter
sentiment_twitter %>%
count(sentiment) %>%
# Arrange the sentiment counts in descending order
arrange(-n)
## # A tibble: 2 x 2
## sentiment n
## <chr> <int>
## 1 negative 2888
## 2 positive 2050
word_counts <- tidy_twitter %>%
# Append the bing dictionary and filter for positive, fear, and trust.
# NOTE(review): "fear" and "trust" are NRC categories; bing only has
# positive/negative, so only "positive" survives this filter here.
inner_join(tidytext::get_sentiments("bing")) %>%
filter(sentiment %in% c("positive", "fear", "trust")) %>%
# Count by word and sentiment and keep the top 10 of each
count(word, sentiment) %>%
group_by(sentiment) %>%
top_n(10, n) %>%
ungroup() %>%
# Create a factor called word2 that has each word ordered by the count
mutate(word2 = fct_reorder(word, n))
## Joining, by = "word"
# Create a bar plot out of the word counts colored by sentiment
ggplot(word_counts, aes(x=word2, y=n, fill=sentiment)) +
geom_col(show.legend = FALSE) +
# Create a separate facet for each sentiment with free axes
facet_wrap(~ sentiment, scales = "free") +
coord_flip() +
# Title the plot "Sentiment Word Counts" with "Words" for the x-axis
labs(title = "Sentiment Word Counts", x = "Words")
tidy_twitter %>%
# Append the bing sentiment dictionary (original comment said NRC)
inner_join(tidytext::get_sentiments("bing")) %>%
# Count by complaint label and sentiment
count(complaint_label, sentiment) %>%
# Spread the sentiment and count columns
# (spread() is superseded by pivot_wider() in tidyr >= 1.0)
spread(sentiment, n)
## Joining, by = "word"
## # A tibble: 2 x 3
## complaint_label negative positive
## <chr> <int> <int>
## 1 Complaint 1457 395
## 2 Non-Complaint 1431 1655
tidy_twitter %>%
# Append the afinn sentiment dictionary (scores each word from -5 to +5)
inner_join(tidytext::get_sentiments("afinn")) %>%
# Group by both complaint label and whether or not the user is verified
group_by(complaint_label, usr_verified) %>%
# Summarize with an aggregate score. The afinn column was renamed from
# "score" to "value" in newer tidytext releases, hence sum(value) here.
summarize(aggregate_score = sum(value)) %>%
# Spread the complaint_label and aggregate_score columns
spread(complaint_label, aggregate_score) %>%
mutate(overall_sentiment = Complaint + `Non-Complaint`)
## Joining, by = "word"
## # A tibble: 2 x 4
## usr_verified Complaint `Non-Complaint` overall_sentiment
## <lgl> <dbl> <dbl> <dbl>
## 1 FALSE -1556 2348 792
## 2 TRUE -12 63 51
sentiment_twitter <- tidy_twitter %>%
# Append the bing sentiment dictionary
inner_join(tidytext::get_sentiments("bing")) %>%
# Count by complaint label and sentiment
count(complaint_label, sentiment) %>%
# Spread the sentiment and count columns
spread(sentiment, n) %>%
# Compute overall_sentiment = positive - negative
mutate(overall_sentiment = positive - negative)
## Joining, by = "word"
# Create a bar plot out of overall sentiment by complaint level, colored by a
# complaint label factor
ggplot(sentiment_twitter, aes(x=complaint_label, y=overall_sentiment, fill=as.factor(complaint_label))) +
geom_col(show.legend = FALSE) +
coord_flip() +
# Title the plot "Overall Sentiment by Complaint Type," with an "Airline Twitter Data" subtitle
labs(title = "Overall Sentiment by Complaint Type", subtitle = "Airline Twitter Data")
Chapter 4 - Topic Modeling
Latent Dirichlet Allocation:
Document Term Matrices:
Running Topic Models:
Interpreting Topics:
Wrap Up:
Example code includes:
# Ad-hoc exclusion list for topic modeling: airline Twitter handles, URL
# artifacts (t.co, http), bare numerals, and non-English fillers that would
# otherwise dominate the topics
excl_words <- c("t.co", "http", "klm", "united", "americanair", "delta", "de", "southwestair", "usairways",
"jetblue", "british_airways", "amp", "deltaassist", "2", "ryanair", "4", "en",
"aircanada", "el", "emirates", "3", "virginamerica", "alaskaair", "1", "es", "vueling",
"britishairways", "se", "indonesiagaruda", "airfrancefr", "nedmex", "turkishairlines",
"airasia", "20", "flyfrontier", "tamairlines", "5", "30", "6", "10", "taylorcaniff",
"dm", "frontiercare", "ik"
)
# Cast the word counts by tweet into a DTM (documents = tweets, terms = words)
dtm_twitter <- tidy_twitter %>%
filter(!(word %in% excl_words)) %>%
count(word, tweet_id) %>%
tidytext::cast_dtm(tweet_id, word, n)
# Run an LDA with 2 topics and a Gibbs sampler; fixed seed for reproducibility
lda_out2 <- topicmodels::LDA(dtm_twitter, k = 2, method = "Gibbs", control = list(seed = 42))
# Tidy the matrix of word probabilities: one row per (topic, term) with beta,
# the per-topic word probability
lda_topics <- lda_out2 %>%
broom::tidy(matrix = "beta")
# Print the output from LDA run
lda_topics
## # A tibble: 35,906 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 _adowaa_ 0.00000357
## 2 2 _adowaa_ 0.0000403
## 3 1 _arzar 0.00000357
## 4 2 _arzar 0.0000403
## 5 1 _austrian 0.00000357
## 6 2 _austrian 0.000406
## 7 1 _bbbb_ 0.00000357
## 8 2 _bbbb_ 0.0000403
## 9 1 _cierratindall 0.0000393
## 10 2 _cierratindall 0.00000366
## # ... with 35,896 more rows
# Inspect the LDA output: highest word probabilities (beta) first
lda_topics %>%
  arrange(desc(beta))
## # A tibble: 35,906 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 flight 0.0356
## 2 1 time 0.0102
## 3 1 service 0.00868
## 4 1 fly 0.00822
## 5 1 flights 0.00797
## 6 1 plane 0.00725
## 7 1 flying 0.00693
## 8 1 win 0.00640
## 9 1 delayed 0.00615
## 10 1 trip 0.00615
## # ... with 35,896 more rows
# Produce a grouped summary of the LDA output by topic — the betas within each
# topic form a probability distribution, so each sum should be 1
lda_topics %>%
group_by(topic) %>%
summarize(
# Calculate the sum of the word probabilities
sum = sum(beta),
# Count the number of terms
n = n()
)
## # A tibble: 2 x 3
## topic sum n
## <int> <dbl> <int>
## 1 1 1 17953
## 2 2 1 17953
word_probs <- lda_topics %>%
# Keep the top 10 highest word probabilities by topic
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
# Create term2, a factor ordered by word probability
mutate(term2 = fct_reorder(term, beta))
# Plot term2 and the word probabilities
ggplot(word_probs, aes(x=term2, y=beta)) +
geom_col() +
# Facet the bar plot by topic, one panel per topic
facet_wrap(~topic, scales = "free") +
coord_flip()
# Start with the tidied Twitter data
tidy_twitter %>%
filter(!(word %in% excl_words)) %>%
# Count each word used in each tweet
count(word, tweet_id) %>%
# Use the word counts by tweet to create a DTM (printed, not assigned)
tidytext::cast_dtm(tweet_id, word, n)
## <<DocumentTermMatrix (documents: 6944, terms: 17953)>>
## Non-/sparse entries: 50707/124614925
## Sparsity : 100%
## Maximal term length: 44
## Weighting : term frequency (tf)
# Assign the DTM to dtm_twitter
dtm_twitter <- tidy_twitter %>%
filter(!(word %in% excl_words)) %>%
count(word, tweet_id) %>%
# Cast the word counts by tweet into a DTM
tidytext::cast_dtm(tweet_id, word, n)
# Coerce dtm_twitter into a matrix called matrix_twitter
matrix_twitter <- as.matrix(dtm_twitter)
# Print rows 2 through 5 and sample columns
cn <- colnames(matrix_twitter)
matrix_twitter[2:5, match(c("fabulous", "industry", "village", "mistake", "time", "volunteer", "support"), cn)]
## Terms
## Docs fabulous industry village mistake time volunteer
## 478816318784036864 0 0 0 1 0 0
## 477008545637224448 0 0 1 0 0 1
## 477077022695768064 1 1 0 0 0 0
## 478083958534856704 0 0 0 0 1 0
## Terms
## Docs support
## 478816318784036864 0
## 477008545637224448 0
## 477077022695768064 1
## 478083958534856704 0
# Fit the 2-topic model again (same seed) and inspect the fitted S4 object;
# glimpse() shows its slots, including the log-beta matrix and gamma
# (per-document topic proportions).
# Run an LDA with 2 topics and a Gibbs sampler
lda_out <- topicmodels::LDA(dtm_twitter, k = 2, method = "Gibbs", control = list(seed = 42))
# Glimpse the topic model output
glimpse(lda_out)
## Formal class 'LDA_Gibbs' [package "topicmodels"] with 16 slots
## ..@ seedwords : NULL
## ..@ z : int [1:51738] 2 2 2 2 2 1 1 2 2 2 ...
## ..@ alpha : num 25
## ..@ call : language topicmodels::LDA(x = dtm_twitter, k = 2, method = "Gibbs", control = list(seed = 42))
## ..@ Dim : int [1:2] 6944 17953
## ..@ control :Formal class 'LDA_Gibbscontrol' [package "topicmodels"] with 14 slots
## ..@ k : int 2
## ..@ terms : chr [1:17953] "_adowaa_" "_arzar" "_austrian" "_bbbb_" ...
## ..@ documents : chr [1:6944] "486973619952971776" "478816318784036864" "477008545637224448" "477077022695768064" ...
## ..@ beta : num [1:2, 1:17953] -12.5 -10.1 -12.5 -10.1 -12.5 ...
## ..@ gamma : num [1:6944, 1:2] 0.481 0.468 0.55 0.474 0.508 ...
## ..@ wordassignments:List of 5
## .. ..$ i : int [1:50707] 1 1 2 2 2 2 2 2 2 2 ...
## .. ..$ j : int [1:50707] 1 17590 2 2142 9724 10103 10306 10943 12249 12410 ...
## .. ..$ v : num [1:50707] 2 2 2 2 2 1 1 2 2 2 ...
## .. ..$ nrow: int 6944
## .. ..$ ncol: int 17953
## .. ..- attr(*, "class")= chr "simple_triplet_matrix"
## ..@ loglikelihood : num -457342
## ..@ iter : int 2000
## ..@ logLiks : num(0)
## ..@ n : int 51738
# Tidy the matrix of word probabilities
# (converts log-scale beta slot into per-topic word probabilities)
lda_topics <- lda_out %>%
broom::tidy(matrix="beta")
# Arrange the topics by word probabilities in descending order
lda_topics %>%
arrange(-beta)
## # A tibble: 35,906 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 flight 0.0356
## 2 1 time 0.0102
## 3 1 service 0.00868
## 4 1 fly 0.00822
## 5 1 flights 0.00797
## 6 1 plane 0.00725
## 7 1 flying 0.00693
## 8 1 win 0.00640
## 9 1 delayed 0.00615
## 10 1 trip 0.00615
## # ... with 35,896 more rows
# Repeat the fit with k = 3 topics to compare topic separation.
# NOTE(review): this reuses the name lda_out2, which earlier held a k = 2 fit.
# Run an LDA with 3 topics and a Gibbs sampler
lda_out2 <- topicmodels::LDA(dtm_twitter, k = 3, method = "Gibbs", control = list(seed = 42))
# Tidy the matrix of word probabilities
lda_topics2 <- lda_out2 %>%
broom::tidy(matrix = "beta")
# Arrange the topics by word probabilities in descending order
lda_topics2 %>%
arrange(-beta)
## # A tibble: 53,859 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 2 flight 0.0482
## 2 2 time 0.0145
## 3 2 service 0.0123
## 4 1 flights 0.0119
## 5 2 fly 0.0114
## 6 2 plane 0.0103
## 7 1 win 0.00953
## 8 1 trip 0.00916
## 9 2 delayed 0.00888
## 10 1 flying 0.00884
## # ... with 53,849 more rows
# Select the top 15 terms by topic and reorder term
word_probs2 <- lda_topics2 %>%
group_by(topic) %>%
top_n(15, beta) %>%
ungroup() %>%
mutate(term2 = fct_reorder(term, beta))
# Plot word_probs2, color and facet based on topic
# (topic must be a factor for a discrete fill scale)
ggplot(word_probs2, aes(x=term2, y=beta, fill=as.factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip()
# Repeat the fit with k = 4 topics; same inspection/plot workflow as k = 3.
# Run an LDA with 4 topics and a Gibbs sampler
lda_out3 <- topicmodels::LDA(dtm_twitter, k = 4, method = "Gibbs", control = list(seed = 42))
# Tidy the matrix of word probabilities
lda_topics3 <- lda_out3 %>%
broom::tidy(matrix = "beta")
# Arrange the topics by word probabilities in descending order
lda_topics3 %>%
arrange(-beta)
## # A tibble: 71,812 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 3 flight 0.0632
## 2 3 time 0.0189
## 3 2 service 0.0165
## 4 1 flights 0.0153
## 5 2 fly 0.0153
## 6 3 plane 0.0134
## 7 1 win 0.0123
## 8 1 trip 0.0118
## 9 3 delayed 0.0116
## 10 2 airline 0.0115
## # ... with 71,802 more rows
# Select the top 15 terms by topic and reorder term
word_probs3 <- lda_topics3 %>%
group_by(topic) %>%
top_n(15, beta) %>%
ungroup() %>%
mutate(term2 = fct_reorder(term, beta))
# Plot word_probs3, color and facet based on topic
ggplot(word_probs3, aes(x=term2, y=beta, fill=as.factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip()
Chapter 1 - Introduction to Data Science
What is Data Science?
Applications of Data Science:
Building a Data Science Team:
Chapter 2 - Data Sources and Risks
Data Sources and Risks:
Solicited Data:
Collecting Additional Data:
Data Storage and Retrieval:
Chapter 3 - Analysis and Visualization
Dashboards:
Ad hoc analysis:
A/B Testing:
Chapter 4 - Prediction
Supervised Machine Learning:
Clustering:
Special Topics in Machine Learning:
Deep Learning and Explainable AI:
Chapter 1 - The Basics
Introduction:
Financial returns:
Basic data types:
Example code includes:
# Transcript of basic arithmetic operators, assignment, and atomic types.
# Addition!
3 + 5
## [1] 8
# Subtraction!
6 - 4
## [1] 2
# Addition
2 + 2
## [1] 4
# Subtraction
6 - 4
## [1] 2
# Multiplication
3 * 4
## [1] 12
# Division
4 / 2
## [1] 2
# Exponentiation
2^4
## [1] 16
# Modulo
7 %% 3
## [1] 1
# Assign 200 to savings
savings <- 200
# Print the value of savings to the console
savings
## [1] 200
# Assign 100 to my_money
my_money <- 100
# Assign 200 to dans_money
dans_money <- 200
# Add my_money and dans_money
my_money + dans_money
## [1] 300
# Add my_money and dans_money again, save the result to our_money
our_money <- my_money + dans_money
# Variables for starting_cash and 5% return during January
starting_cash <- 200
jan_ret <- 5
# growth multiplier: 1 + (percent return / 100)
jan_mult <- 1 + (jan_ret / 100)
# How much money do you have at the end of January?
post_jan_cash <- starting_cash * jan_mult
# Print post_jan_cash
post_jan_cash
## [1] 210
# January 10% return multiplier
jan_ret_10 <- 10
jan_mult_10 <- 1 + (jan_ret_10 / 100)
# How much money do you have at the end of January now?
post_jan_cash_10 <- starting_cash * jan_mult_10
# Print post_jan_cash_10
post_jan_cash_10
## [1] 220
# Starting cash and returns
starting_cash <- 200
jan_ret <- 4
feb_ret <- 5
# Multipliers
jan_mult <- 1 + jan_ret/100
feb_mult <- 1 + feb_ret/100
# Total cash at the end of the two months (compounded month over month)
total_cash <- starting_cash * jan_mult * feb_mult
# Print total_cash
total_cash
## [1] 218
# Apple's stock price is a numeric
apple_stock <- 150.45
# Bond credit ratings are characters
credit_rating <- "AAA"
# You like the stock market. TRUE or FALSE?
my_answer <- TRUE
# Print my_answer
my_answer
## [1] TRUE
Chapter 2 - Vectors and Matrices
What is a vector?
Vector manipulation:
Matrix - a 2D vector:
Example code includes:
# Transcript of vector creation, naming, elementwise arithmetic, recycling,
# and subsetting by position / name / negative index.
# Another numeric vector
ibm_stock <- c(159.82, 160.02, 159.84)
# Another character vector
finance <- c("stocks", "bonds", "investments")
# A logical vector
logic <- c(TRUE, FALSE, TRUE)
# Vectors of 12 months of returns, and month names
ret <- c(5, 2, 3, 7, 8, 3, 5, 9, 1, 4, 6, 3)
months <- c("Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")
# Add names to ret
names(ret) <- months
# Print out ret to see the new names!
ret
## Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
## 5 2 3 7 8 3 5 9 1 4 6 3
# Vectors of 12 months of returns, and month names
ret <- c(5, 2, 3, 7, 8, 3, 5, 9, 1, 4, 6, 3)
months <- c("Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")
# Add names to ret
names(ret) <- months
# Print out ret to see the new names!
ret
## Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
## 5 2 3 7 8 3 5 9 1 4 6 3
# Weights and returns
micr_ret <- 7
sony_ret <- 9
micr_weight <- .2
sony_weight <- .8
# Portfolio return (weighted sum of returns)
portf_ret <- micr_ret * micr_weight + sony_ret * sony_weight
# Weights, returns, and company names
ret <- c(7, 9)
weight <- c(.2, .8)
companies <- c("Microsoft", "Sony")
# Assign company names to your vectors
names(ret) <- companies
names(weight) <- companies
# Multiply the returns and weights together (elementwise)
ret_X_weight <- ret*weight
# Print ret_X_weight
ret_X_weight
## Microsoft Sony
## 1.4 7.2
# Sum to get the total portfolio return
portf_ret <- sum(ret_X_weight)
# Print portf_ret
portf_ret
## [1] 8.6
# Print ret
ret
## Microsoft Sony
## 7 9
# Assign 1/3 to weight
weight <- 1/3
# Create ret_X_weight (scalar weight is recycled over both elements)
ret_X_weight <- ret * weight
# Calculate your portfolio return
portf_ret <- sum(ret_X_weight)
# Vector of length 3 * Vector of length 2?
# NOTE(review): ret has length 2 at this point, so this multiplication is
# plain elementwise — the "length 3" in the course prompt does not match
# the data shown; confirm against the original exercise.
ret * c(.2, .6)
## Microsoft Sony
## 1.4 5.4
# Vectors of 12 months of returns, and month names
ret <- c(5, 2, 3, 7, 8, 3, 5, 9, 1, 4, 6, 3)
months <- c("Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")
names(ret) <- months
# First 6 months of returns
ret[1:6]
## Jan Feb Mar Apr May Jun
## 5 2 3 7 8 3
# Just March and May
ret[c("Mar", "May")]
## Mar May
## 3 8
# Omit the first month of returns (negative index drops positions)
ret[-1]
## Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
## 2 3 7 8 3 5 9 1 4 6 3
# Transcript of matrix construction (column-major default vs byrow = TRUE)
# and binding price vectors into matrices with cbind()/rbind().
# A vector of 9 numbers
my_vector <- c(1, 2, 3, 4, 5, 6, 7, 8, 9)
# 3x3 matrix (filled column by column by default)
my_matrix <- matrix(data = my_vector, nrow = 3, ncol = 3)
# Print my_matrix
my_matrix
## [,1] [,2] [,3]
## [1,] 1 4 7
## [2,] 2 5 8
## [3,] 3 6 9
# Filling across using byrow = TRUE
matrix(data = c(2, 3, 4, 5), nrow = 2, ncol = 2, byrow = TRUE)
## [,1] [,2]
## [1,] 2 3
## [2,] 4 5
# 63 prices: Apple (1-21), IBM (22-42), Microsoft (43-63)
prices <- c(109.49, 109.9, 109.11, 109.95, 111.03, 112.12, 113.95, 113.3, 115.19, 115.19, 115.82, 115.97, 116.64, 116.95, 117.06, 116.29, 116.52, 117.26, 116.76, 116.73, 115.82, 159.82, 160.02, 159.84, 160.35, 164.79, 165.36, 166.52, 165.5, 168.29, 168.51, 168.02, 166.73, 166.68, 167.6, 167.33, 167.06, 166.71, 167.14, 166.19, 166.6, 165.99, 59.2, 59.25, 60.22, 59.95, 61.37, 61.01, 61.97, 62.17, 62.98, 62.68, 62.58, 62.3, 63.62, 63.54, 63.54, 63.55, 63.24, 63.28, 62.99, 62.9, 62.14)
apple <- prices[1:21]
ibm <- prices[22:42]
micr <- prices[43:63]
# cbind the vectors together (one stock per column)
cbind_stocks <- cbind(apple, ibm, micr)
# Print cbind_stocks
cbind_stocks
## apple ibm micr
## [1,] 109 160 59.2
## [2,] 110 160 59.2
## [3,] 109 160 60.2
## [4,] 110 160 60.0
## [5,] 111 165 61.4
## [6,] 112 165 61.0
## [7,] 114 167 62.0
## [8,] 113 166 62.2
## [9,] 115 168 63.0
## [10,] 115 169 62.7
## [11,] 116 168 62.6
## [12,] 116 167 62.3
## [13,] 117 167 63.6
## [14,] 117 168 63.5
## [15,] 117 167 63.5
## [16,] 116 167 63.5
## [17,] 117 167 63.2
## [18,] 117 167 63.3
## [19,] 117 166 63.0
## [20,] 117 167 62.9
## [21,] 116 166 62.1
# rbind the vectors together (one stock per row)
rbind_stocks <- rbind(apple, ibm, micr)
# Print rbind_stocks
rbind_stocks
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13]
## apple 109.5 109.9 109.1 110 111.0 112 114 113.3 115 115.2 115.8 116.0 116.6
## ibm 159.8 160.0 159.8 160 164.8 165 167 165.5 168 168.5 168.0 166.7 166.7
## micr 59.2 59.2 60.2 60 61.4 61 62 62.2 63 62.7 62.6 62.3 63.6
## [,14] [,15] [,16] [,17] [,18] [,19] [,20] [,21]
## apple 117.0 117.1 116.3 116.5 117.3 117 116.7 115.8
## ibm 167.6 167.3 167.1 166.7 167.1 166 166.6 166.0
## micr 63.5 63.5 63.5 63.2 63.3 63 62.9 62.1
# Transcript of plotting/correlating price matrices and subsetting a matrix
# by row index and column name.
apple_micr_matrix <- cbind(apple, micr)
# View the data
apple_micr_matrix
## apple micr
## [1,] 109 59.2
## [2,] 110 59.2
## [3,] 109 60.2
## [4,] 110 60.0
## [5,] 111 61.4
## [6,] 112 61.0
## [7,] 114 62.0
## [8,] 113 62.2
## [9,] 115 63.0
## [10,] 115 62.7
## [11,] 116 62.6
## [12,] 116 62.3
## [13,] 117 63.6
## [14,] 117 63.5
## [15,] 117 63.5
## [16,] 116 63.5
## [17,] 117 63.2
## [18,] 117 63.3
## [19,] 117 63.0
## [20,] 117 62.9
## [21,] 116 62.1
# Scatter plot of Microsoft vs Apple
plot(apple_micr_matrix)
# Correlation of Apple and IBM
cor(apple, ibm)
## [1] 0.887
# stock matrix
stocks <- cbind(apple, micr, ibm)
# cor() of all three
# cor(apple, micr, ibm) would be an error: pass one matrix instead to get
# the full pairwise correlation matrix.
# cor(apple, micr, ibm)
cor(stocks)
## apple micr ibm
## apple 1.000 0.948 0.887
## micr 0.948 1.000 0.913
## ibm 0.887 0.913 1.000
# Third row
stocks[3, ]
## apple micr ibm
## 109.1 60.2 159.8
# Fourth and fifth row of the ibm column
stocks[4:5, "ibm"]
## [1] 160 165
# apple and micr columns
stocks[, c("apple", "micr")]
## apple micr
## [1,] 109 59.2
## [2,] 110 59.2
## [3,] 109 60.2
## [4,] 110 60.0
## [5,] 111 61.4
## [6,] 112 61.0
## [7,] 114 62.0
## [8,] 113 62.2
## [9,] 115 63.0
## [10,] 115 62.7
## [11,] 116 62.6
## [12,] 116 62.3
## [13,] 117 63.6
## [14,] 117 63.5
## [15,] 117 63.5
## [16,] 116 63.5
## [17,] 117 63.2
## [18,] 117 63.3
## [19,] 117 63.0
## [20,] 117 62.9
## [21,] 116 62.1
Chapter 3 - Data Frames
What is a data frame?
Data frame manipulation:
Present value:
Example code includes:
# Transcript of data.frame creation, inspection (head/tail/str), column
# access/removal, and row filtering with subset().
# Variables
company <- c("A", "A", "A", "B", "B", "B", "B")
cash_flow <- c(1000, 4000, 550, 1500, 1100, 750, 6000)
year <- c(1, 3, 4, 1, 2, 4, 5)
# Data frame (keep company as character, not factor)
cash <- data.frame(company, cash_flow, year, stringsAsFactors = FALSE)
# Print cash
cash
## company cash_flow year
## 1 A 1000 1
## 2 A 4000 3
## 3 A 550 4
## 4 B 1500 1
## 5 B 1100 2
## 6 B 750 4
## 7 B 6000 5
# Call head() for the first 4 rows
head(cash, 4)
## company cash_flow year
## 1 A 1000 1
## 2 A 4000 3
## 3 A 550 4
## 4 B 1500 1
# Call tail() for the last 3 rows
tail(cash, 3)
## company cash_flow year
## 5 B 1100 2
## 6 B 750 4
## 7 B 6000 5
# Call str()
str(cash)
## 'data.frame': 7 obs. of 3 variables:
## $ company : chr "A" "A" "A" "B" ...
## $ cash_flow: num 1000 4000 550 1500 1100 750 6000
## $ year : num 1 3 4 1 2 4 5
# Fix your column names
names(cash) <- c("company", "cash_flow", "year")
# Print out the column names of cash
colnames(cash)
## [1] "company" "cash_flow" "year"
cash <- data.frame(company=c("A", "A", "A", "B", "B", "B", "B"), cash_flow=c(1000, 4000, 550, 1500, 1100, 750, 6000), year=c(1, 3, 4, 1, 2, 4, 5), stringsAsFactors=FALSE)
# Third row, second column
cash[3, 2]
## [1] 550
# Fifth row of the "year" column
cash[5, "year"]
## [1] 2
# Select the year column
cash$year
## [1] 1 3 4 1 2 4 5
# Select the cash_flow column and multiply by 2
cash$cash_flow * 2
## [1] 2000 8000 1100 3000 2200 1500 12000
# Delete the company column (assigning NULL drops a column)
cash$company <- NULL
# Print cash again
cash
## cash_flow year
## 1 1000 1
## 2 4000 3
## 3 550 4
## 4 1500 1
## 5 1100 2
## 6 750 4
## 7 6000 5
# Re-add the company column (appended as the last column)
cash$company <- c("A", "A", "A", "B", "B", "B", "B")
cash
## cash_flow year company
## 1 1000 1 A
## 2 4000 3 A
## 3 550 4 A
## 4 1500 1 B
## 5 1100 2 B
## 6 750 4 B
## 7 6000 5 B
# Rows about company B
subset(cash, company == "B")
## cash_flow year company
## 4 1500 1 B
## 5 1100 2 B
## 6 750 4 B
## 7 6000 5 B
# Rows with cash flows due in 1 year
subset(cash, year == 1)
## cash_flow year company
## 1 1000 1 A
## 4 1500 1 B
# Scenario columns and present-value calculations on the cash data frame
# (defined above). PV = CF * (1 + r)^-t, vectorized over rows.
# Quarter cash flow scenario
cash$quarter_cash <- 0.25 * cash$cash_flow
# Double year scenario
cash$double_year <- cash$year * 2
# Present value of $4000, in 3 years, at 5%
# (use the documented exponentiation operator `^`; `**` is an undocumented
# alias that the R parser accepts but the language definition discourages)
present_value_4k <- 4000 * (1.05 ^ -3)
# Present value of all cash flows
cash$present_value <- cash$cash_flow * (1.05 ^ -cash$year)
# Print out cash
cash
## cash_flow year company quarter_cash double_year present_value
## 1 1000 1 A 250 2 952
## 2 4000 3 A 1000 6 3455
## 3 550 4 A 138 8 452
## 4 1500 1 B 375 2 1429
## 5 1100 2 B 275 4 998
## 6 750 4 B 188 8 617
## 7 6000 5 B 1500 10 4701
# Total present value of cash
total_pv <- sum(cash$present_value)
# Company B information
cash_B <- subset(cash, company == "B")
# Total present value of cash_B
total_pv_B <- sum(cash_B$present_value)
Chapter 4 - Factors
What is a factor?
Ordering and subsetting factors:
Example code includes:
# Transcript of creating a factor, inspecting its levels, renaming levels,
# and summarizing/plotting it.
# credit_rating character vector
credit_rating <- c("BB", "AAA", "AA", "CCC", "AA", "AAA", "B", "BB")
# Create a factor from credit_rating (levels default to alphabetical order)
credit_factor <- factor(credit_rating)
# Print out your new factor
credit_factor
## [1] BB AAA AA CCC AA AAA B BB
## Levels: AA AAA B BB CCC
# Call str() on credit_rating
str(credit_rating)
## chr [1:8] "BB" "AAA" "AA" "CCC" "AA" "AAA" "B" "BB"
# Call str() on credit_factor
str(credit_factor)
## Factor w/ 5 levels "AA","AAA","B",..: 4 2 1 5 1 2 3 4
# Identify unique levels
levels(credit_factor)
## [1] "AA" "AAA" "B" "BB" "CCC"
# Rename the levels of credit_factor
# (new names are matched positionally to the existing level order)
levels(credit_factor) <- c("2A", "3A", "1B", "2B", "3C")
# Print credit_factor
credit_factor
## [1] 2B 3A 2A 3C 2A 3A 1B 2B
## Levels: 2A 3A 1B 2B 3C
# Summarize the character vector, credit_rating
summary(credit_rating)
## AA AAA B BB CCC
## 2 2 1 2 1
# Summarize the factor, credit_factor
summary(credit_factor)
## 2A 3A 1B 2B 3C
## 2 2 1 2 1
# Visualize your factor!
plot(credit_factor)
# Transcript of binning a numeric vector into a factor with cut().
AAA_rank <- c(31, 48, 100, 53, 85, 73, 62, 74, 42, 38, 97, 61, 48, 86, 44, 9, 43, 18, 62, 38, 23, 37, 54, 80, 78, 93, 47, 100, 22, 22, 18, 26, 81, 17, 98, 4, 83, 5, 6, 52, 29, 44, 50, 2, 25, 19, 15, 42, 30, 27)
# Create 4 buckets for AAA_rank using cut()
# (intervals are right-closed by default: (0,25], (25,50], (50,75], (75,100])
AAA_factor <- cut(x = AAA_rank, breaks = c(0, 25, 50, 75, 100))
# Rename the levels
levels(AAA_factor) <- c("low", "medium", "high", "very_high")
# Print AAA_factor
AAA_factor
## [1] medium medium very_high high very_high high high
## [8] high medium medium very_high high medium very_high
## [15] medium low medium low high medium low
## [22] medium high very_high very_high very_high medium very_high
## [29] low low low medium very_high low very_high
## [36] low very_high low low high medium medium
## [43] medium low low low low medium medium
## [50] medium
## Levels: low medium high very_high
# Plot AAA_factor
plot(AAA_factor)
# Transcript of ordered factors, dropping unused levels when subsetting, and
# storing a factor column in a data frame.
# Use unique() to find unique words
unique(credit_rating)
## [1] "BB" "AAA" "AA" "CCC" "B"
# Create an ordered factor (levels listed from best to worst rating)
credit_factor_ordered <- factor(credit_rating, ordered = TRUE, levels = c("AAA", "AA", "BB", "B", "CCC"))
# Plot credit_factor_ordered
plot(credit_factor_ordered)
# Remove the A bonds at positions 3 and 7. Don't drop the A level.
keep_level <- credit_factor[-c(3, 7)]
# Plot keep_level
plot(keep_level)
# Remove the A bonds at positions 3 and 7. Drop the A level.
# (drop = TRUE removes levels with no remaining observations)
drop_level <- credit_factor[-c(3, 7), drop=TRUE]
# Plot drop_level
plot(drop_level)
# Variables
credit_rating <- c("AAA", "A", "BB")
bond_owners <- c("Dan", "Tom", "Joe")
# Create the data frame of character vectors, bonds
bonds <- data.frame(credit_rating, bond_owners, stringsAsFactors=FALSE)
# Use str() on bonds
str(bonds)
## 'data.frame': 3 obs. of 2 variables:
## $ credit_rating: chr "AAA" "A" "BB"
## $ bond_owners : chr "Dan" "Tom" "Joe"
# Create a factor column in bonds called credit_factor from credit_rating
bonds$credit_factor <- factor(bonds$credit_rating, ordered = TRUE, levels = c("AAA", "A", "BB"))
# Use str() on bonds again
str(bonds)
## 'data.frame': 3 obs. of 3 variables:
## $ credit_rating: chr "AAA" "A" "BB"
## $ bond_owners : chr "Dan" "Tom" "Joe"
## $ credit_factor: Ord.factor w/ 3 levels "AAA"<"A"<"BB": 1 2 3
Chapter 5 - Lists
What is a list?
List creating functions:
Wrap up:
Example code includes:
# Transcript of building a heterogeneous list, naming its elements, and
# accessing/adding/modifying/removing elements.
# List components
name <- "Apple and IBM"
apple <- c(109.49, 109.90, 109.11, 109.95, 111.03)
ibm <- c(159.82, 160.02, 159.84, 160.35, 164.79)
cor_matrix <- cor(cbind(apple, ibm))
# Create a list (mixes a string, two numeric vectors, and a matrix)
portfolio <- list(name, apple, ibm, cor_matrix)
# View your first list
portfolio
## [[1]]
## [1] "Apple and IBM"
##
## [[2]]
## [1] 109 110 109 110 111
##
## [[3]]
## [1] 160 160 160 160 165
##
## [[4]]
## apple ibm
## apple 1.000 0.913
## ibm 0.913 1.000
# Add names to your portfolio
names(portfolio) <- c("portfolio_name", "apple", "ibm", "correlation")
# Print portfolio
portfolio
## $portfolio_name
## [1] "Apple and IBM"
##
## $apple
## [1] 109 110 109 110 111
##
## $ibm
## [1] 160 160 160 160 165
##
## $correlation
## apple ibm
## apple 1.000 0.913
## ibm 0.913 1.000
# Second and third elements of portfolio
# (single brackets return a sub-list, not the elements themselves)
portfolio[c(2, 3)]
## $apple
## [1] 109 110 109 110 111
##
## $ibm
## [1] 160 160 160 160 165
# Use $ to get the correlation data
portfolio$correlation
## apple ibm
## apple 1.000 0.913
## ibm 0.913 1.000
# Add weight: 20% Apple, 80% IBM
# (assigning to a new name appends an element to the list)
portfolio$weight <- c(apple = 0.2, ibm = 0.8)
# Print portfolio
portfolio
## $portfolio_name
## [1] "Apple and IBM"
##
## $apple
## [1] 109 110 109 110 111
##
## $ibm
## [1] 160 160 160 160 165
##
## $correlation
## apple ibm
## apple 1.000 0.913
## ibm 0.913 1.000
##
## $weight
## apple ibm
## 0.2 0.8
# Change the weight variable: 30% Apple, 70% IBM
portfolio$weight <- c(apple = 0.3, ibm = 0.7)
# Print portfolio to see the changes
portfolio
## $portfolio_name
## [1] "Apple and IBM"
##
## $apple
## [1] 109 110 109 110 111
##
## $ibm
## [1] 160 160 160 160 165
##
## $correlation
## apple ibm
## apple 1.000 0.913
## ibm 0.913 1.000
##
## $weight
## apple ibm
## 0.3 0.7
# Take a look at portfolio
portfolio
## $portfolio_name
## [1] "Apple and IBM"
##
## $apple
## [1] 109 110 109 110 111
##
## $ibm
## [1] 160 160 160 160 165
##
## $correlation
## apple ibm
## apple 1.000 0.913
## ibm 0.913 1.000
##
## $weight
## apple ibm
## 0.3 0.7
# Remove the microsoft stock prices from your portfolio
# (assigning NULL removes an element; harmless here since none exists)
portfolio$microsoft <- NULL
# Transcript of split()/unsplit() on the cash data frame (defined above).
# Define grouping from year
grouping <- cash$year
# Split cash on your new grouping (one data frame per distinct year)
split_cash <- split(cash, grouping)
# Look at your split_cash list
split_cash
## $`1`
## cash_flow year company quarter_cash double_year present_value
## 1 1000 1 A 250 2 952
## 4 1500 1 B 375 2 1429
##
## $`2`
## cash_flow year company quarter_cash double_year present_value
## 5 1100 2 B 275 4 998
##
## $`3`
## cash_flow year company quarter_cash double_year present_value
## 2 4000 3 A 1000 6 3455
##
## $`4`
## cash_flow year company quarter_cash double_year present_value
## 3 550 4 A 138 8 452
## 6 750 4 B 188 8 617
##
## $`5`
## cash_flow year company quarter_cash double_year present_value
## 7 6000 5 B 1500 10 4701
# Unsplit split_cash to get the original data back.
original_cash <- unsplit(split_cash, grouping)
# Print original_cash
original_cash
## cash_flow year company quarter_cash double_year present_value
## 1 1000 1 A 250 2 952
## 2 4000 3 A 1000 6 3455
## 3 550 4 A 138 8 452
## 4 1500 1 B 375 2 1429
## 5 1100 2 B 275 4 998
## 6 750 4 B 188 8 617
## 7 6000 5 B 1500 10 4701
# Print split_cash
split_cash
## $`1`
## cash_flow year company quarter_cash double_year present_value
## 1 1000 1 A 250 2 952
## 4 1500 1 B 375 2 1429
##
## $`2`
## cash_flow year company quarter_cash double_year present_value
## 5 1100 2 B 275 4 998
##
## $`3`
## cash_flow year company quarter_cash double_year present_value
## 2 4000 3 A 1000 6 3455
##
## $`4`
## cash_flow year company quarter_cash double_year present_value
## 3 550 4 A 138 8 452
## 6 750 4 B 188 8 617
##
## $`5`
## cash_flow year company quarter_cash double_year present_value
## 7 6000 5 B 1500 10 4701
# Print the cash_flow column of B in split_cash
# NOTE(review): split_cash is keyed by year ("1".."5"), not by company, so
# $B does not exist and returns NULL (as the output below shows).
split_cash$B$cash_flow
## NULL
# Set the cash_flow column of company A in split_cash to 0
# NOTE(review): likewise there is no "A" element; this assignment adds a new
# list element named "A" that unsplit() (driven by the year grouping) never
# reads — hence cash_no_A below still shows company A's original cash flows.
split_cash$A$cash_flow <- 0
# Use the grouping to unsplit split_cash
cash_no_A <- unsplit(split_cash, grouping)
# Print cash_no_A
cash_no_A
## cash_flow year company quarter_cash double_year present_value
## 1 1000 1 A 250 2 952
## 2 4000 3 A 1000 6 3455
## 3 550 4 A 138 8 452
## 4 1500 1 B 375 2 1429
## 5 1100 2 B 275 4 998
## 6 750 4 B 188 8 617
## 7 6000 5 B 1500 10 4701
# Demonstrate attributes() / attr() on a named matrix and an ordered factor.
# my_matrix and my_factor
my_matrix <- matrix(c(1,2,3,4,5,6), nrow = 2, ncol = 3)
rownames(my_matrix) <- c("Row1", "Row2")
colnames(my_matrix) <- c("Col1", "Col2", "Col3")
# Use TRUE rather than the shorthand T: T is an ordinary variable that can
# be reassigned, while TRUE is a reserved word.
my_factor <- factor(c("A", "A", "B"), ordered = TRUE, levels = c("A", "B"))
# attributes of my_matrix (dim plus the row/column dimnames)
attributes(my_matrix)
## $dim
## [1] 2 3
##
## $dimnames
## $dimnames[[1]]
## [1] "Row1" "Row2"
##
## $dimnames[[2]]
## [1] "Col1" "Col2" "Col3"
# Just the dim attribute of my_matrix
attr(my_matrix, which="dim")
## [1] 2 3
# attributes of my_factor (its levels and its ordered-factor class)
attributes(my_factor)
## $levels
## [1] "A" "B"
##
## $class
## [1] "ordered" "factor"
Chapter 1 - Dates
Introduction to dates in R:
Date formats and extractor functions:
Example code includes:
# Transcript of Date basics: Sys.Date()/Sys.time(), as.Date() with format
# strings, numeric representation (days since 1970-01-01), and extractors.
# What is the current date?
# Sys.Date()
# What is the current date and time?
# Sys.time()
# Create the variable today
today <- Sys.Date()
# Confirm the class of today
class(today)
## [1] "Date"
# Create crash (ISO yyyy-mm-dd parses without an explicit format)
crash <- as.Date("2008-09-29")
# Print crash
crash
## [1] "2008-09-29"
# crash as a numeric (days since the 1970-01-01 origin)
as.numeric(crash)
## [1] 14151
# Current time as a numeric
as.numeric(Sys.time())
## [1] 1.57e+09
# Incorrect date format
# (as.Date errors on mm/dd/yyyy without a format string)
# as.Date("09/29/2008")
# Create dates from "2017-02-05" to "2017-02-08" inclusive.
dates <- c("2017-02-05", "2017-02-06", "2017-02-07", "2017-02-08")
# Add names to dates
names(dates) <- c("Sunday", "Monday", "Tuesday", "Wednesday")
# Subset dates to only return the date for Monday
dates["Monday"]
## Monday
## "2017-02-06"
# "08,30,30"
as.Date("08,30,1930", format = "%m,%d,%Y")
## [1] "1930-08-30"
# "Aug 30,1930"
as.Date("Aug 30,1930", format = "%b %d,%Y")
## [1] "1930-08-30"
# "30aug1930"
as.Date("30aug1930", format = "%d%b%Y")
## [1] "1930-08-30"
# char_dates
char_dates <- c("1jan17", "2jan17", "3jan17", "4jan17", "5jan17")
# Create dates using as.Date() and the correct format
# (%y is the two-digit year)
dates <- as.Date(char_dates, format="%d%b%y")
# Use format() to go from "2017-01-04" -> "Jan 04, 17"
format(dates, format="%b %d, %y")
## [1] "Jan 01, 17" "Jan 02, 17" "Jan 03, 17" "Jan 04, 17" "Jan 05, 17"
# Use format() to go from "2017-01-04" -> "01,04,2017"
format(dates, format="%m,%d,%Y")
## [1] "01,01,2017" "01,02,2017" "01,03,2017" "01,04,2017" "01,05,2017"
# Dates
dates <- as.Date(c("2017-01-01", "2017-01-02", "2017-01-03"))
# Create the origin
origin <- as.Date("1970-01-01")
# Use as.numeric() on dates
as.numeric(dates)
## [1] 17167 17168 17169
# Find the difference between dates and origin (a difftime in days)
dates-origin
## Time differences in days
## [1] 17167 17168 17169
# dates
dates <- as.Date(c("2017-01-02", "2017-05-03", "2017-08-04", "2017-10-17"))
# Extract the months
months(dates)
## [1] "January" "May" "August" "October"
# Extract the quarters
quarters(dates)
## [1] "Q1" "Q2" "Q3" "Q4"
# dates2
dates2 <- as.Date(c("2017-01-02", "2017-01-03", "2017-01-04", "2017-01-05"))
# Assign the weekdays() of dates2 as the names()
names(dates2) <- weekdays(dates2)
# Print dates2
dates2
## Monday Tuesday Wednesday Thursday
## "2017-01-02" "2017-01-03" "2017-01-04" "2017-01-05"
Chapter 2 - If Statements and Operators
Relational Operators:
Logical Operators:
If statements:
Example code includes:
# Relational operators on numerics and Dates.
# Stock prices
apple <- 48.99
micr <- 77.93
# Apple vs Microsoft
apple > micr
## [1] FALSE
# Not equals
apple != micr
## [1] TRUE
# Dates - today and tomorrow
# Sys.Date() already returns a Date object, so wrapping it in as.Date()
# is redundant; call it directly. Adding 1 advances a Date by one day.
today <- Sys.Date()
tomorrow <- Sys.Date() + 1
# Today vs Tomorrow
tomorrow < today
## [1] FALSE
# Transcript of logical operators (& | !), is.na(), and subset() on a small
# stocks data frame.
stocks <- data.frame(ibm=c(171, 171, 176, 178), panera=c(217, 261, 214, 212),
date=as.Date(c("2017-01-20", "2017-01-23", "2017-01-24", "2017-01-25"))
)
# Print stocks
stocks
## ibm panera date
## 1 171 217 2017-01-20
## 2 171 261 2017-01-23
## 3 176 214 2017-01-24
## 4 178 212 2017-01-25
# IBM range
stocks$ibm_buy <- (stocks$ibm < 175)
# Panera range
stocks$panera_sell <- (stocks$panera > 213)
# IBM vs Panera
stocks$ibm_vs_panera <- (stocks$ibm > stocks$panera)
# Print stocks
stocks
## ibm panera date ibm_buy panera_sell ibm_vs_panera
## 1 171 217 2017-01-20 TRUE TRUE FALSE
## 2 171 261 2017-01-23 TRUE TRUE FALSE
## 3 176 214 2017-01-24 FALSE TRUE FALSE
## 4 178 212 2017-01-25 FALSE FALSE FALSE
# IBM buy range (elementwise AND for vector conditions)
stocks$ibm_buy_range <- (stocks$ibm > 171) & (stocks$ibm < 176)
# Panera spikes (elementwise OR)
stocks$panera_spike <- (stocks$panera < 213.2) | (stocks$panera > 216.5)
# Date range (Dates compare chronologically)
stocks$good_dates <- (stocks$date > as.Date("2017-01-21")) & (stocks$date < as.Date("2017-01-25"))
# Print stocks
stocks
## ibm panera date ibm_buy panera_sell ibm_vs_panera ibm_buy_range
## 1 171 217 2017-01-20 TRUE TRUE FALSE FALSE
## 2 171 261 2017-01-23 TRUE TRUE FALSE FALSE
## 3 176 214 2017-01-24 FALSE TRUE FALSE FALSE
## 4 178 212 2017-01-25 FALSE FALSE FALSE FALSE
## panera_spike good_dates
## 1 TRUE FALSE
## 2 TRUE TRUE
## 3 FALSE TRUE
## 4 TRUE FALSE
# IBM range (logical negation)
!(stocks$ibm > 176)
## [1] TRUE TRUE TRUE FALSE
# Missing data
missing <- c(24.5, 25.7, NA, 28, 28.6, NA)
# Is missing? (never test NA with ==; use is.na())
is.na(missing)
## [1] FALSE FALSE TRUE FALSE FALSE TRUE
# Not missing?
!is.na(missing)
## [1] TRUE TRUE FALSE TRUE TRUE FALSE
# Panera range
subset(stocks, panera > 216)
## ibm panera date ibm_buy panera_sell ibm_vs_panera ibm_buy_range
## 1 171 217 2017-01-20 TRUE TRUE FALSE FALSE
## 2 171 261 2017-01-23 TRUE TRUE FALSE FALSE
## panera_spike good_dates
## 1 TRUE FALSE
## 2 TRUE TRUE
# Specific date
subset(stocks, date == as.Date("2017-01-23"))
## ibm panera date ibm_buy panera_sell ibm_vs_panera ibm_buy_range
## 2 171 261 2017-01-23 TRUE TRUE FALSE FALSE
## panera_spike good_dates
## 2 TRUE TRUE
# IBM and Panera joint range
subset(stocks, ibm < 175 & panera < 216.5)
## [1] ibm panera date ibm_buy panera_sell
## [6] ibm_vs_panera ibm_buy_range panera_spike good_dates
## <0 rows> (or 0-length row.names)
# View stocks
stocks
## ibm panera date ibm_buy panera_sell ibm_vs_panera ibm_buy_range
## 1 171 217 2017-01-20 TRUE TRUE FALSE FALSE
## 2 171 261 2017-01-23 TRUE TRUE FALSE FALSE
## 3 176 214 2017-01-24 FALSE TRUE FALSE FALSE
## 4 178 212 2017-01-25 FALSE FALSE FALSE FALSE
## panera_spike good_dates
## 1 TRUE FALSE
## 2 TRUE TRUE
## 3 FALSE TRUE
## 4 TRUE FALSE
# Weekday investigation
stocks$weekday <- weekdays(stocks$date)
# View stocks again
stocks
## ibm panera date ibm_buy panera_sell ibm_vs_panera ibm_buy_range
## 1 171 217 2017-01-20 TRUE TRUE FALSE FALSE
## 2 171 261 2017-01-23 TRUE TRUE FALSE FALSE
## 3 176 214 2017-01-24 FALSE TRUE FALSE FALSE
## 4 178 212 2017-01-25 FALSE FALSE FALSE FALSE
## panera_spike good_dates weekday
## 1 TRUE FALSE Friday
## 2 TRUE TRUE Monday
## 3 FALSE TRUE Tuesday
## 4 TRUE FALSE Wednesday
# Remove missing data
# NOTE(review): `apple` and `micr` are scalar globals here (48.99, 77.93),
# not columns of this stocks data frame — the course exercise used a
# different stocks dataset. The scalar conditions recycle over all rows, so
# every row is kept; confirm against the original exercise if reusing.
stocks_no_NA <- subset(stocks, !is.na(apple))
# Apple and Microsoft joint range
subset(stocks_no_NA, apple > 117 | micr > 63)
## ibm panera date ibm_buy panera_sell ibm_vs_panera ibm_buy_range
## 1 171 217 2017-01-20 TRUE TRUE FALSE FALSE
## 2 171 261 2017-01-23 TRUE TRUE FALSE FALSE
## 3 176 214 2017-01-24 FALSE TRUE FALSE FALSE
## 4 178 212 2017-01-25 FALSE FALSE FALSE FALSE
## panera_spike good_dates weekday
## 1 TRUE FALSE Friday
## 2 TRUE TRUE Monday
## 3 FALSE TRUE Tuesday
## 4 TRUE FALSE Wednesday
# Transcript of if / else if / else, nested if, and vectorized ifelse().
# micr
micr <- 48.55
# Fill in the blanks
if( micr < 55 ) {
print("Buy!")
}
## [1] "Buy!"
# micr
micr <- 57.44
# Fill in the blanks
if( micr < 55 ) {
print("Buy!")
} else {
print("Do nothing!")
}
## [1] "Do nothing!"
# micr
micr <- 105.67
# Fill in the blanks
if( micr < 55 ) {
print("Buy!")
} else if( micr >= 55 & micr < 75 ){
print("Do nothing!")
} else {
print("Sell!")
}
## [1] "Sell!"
# micr
micr <- 105.67
shares <- 1
# Fill in the blanks (nested if inside the final else branch)
if( micr < 55 ) {
print("Buy!")
} else if( micr >= 55 & micr < 75 ) {
print("Do nothing!")
} else {
if( shares >= 1 ) {
print("Sell!")
} else {
print("Not enough shares to sell!")
}
}
## [1] "Sell!"
stocks <- data.frame(apple=c(109.49, 109.9, 109.11, 109.95, 111.03, 112.12, 113.95, 113.3, 115.19, 115.19, 115.82, 115.97, 116.64, 116.95, 117.06, 116.29, 116.52, 117.26, 116.76, 116.73, 115.82),
micr=c(59.2, 59.25, 60.22, 59.95, 61.37, 61.01, 61.97, 62.17, 62.98, 62.68, 62.58, 62.3, 63.62, 63.54, 63.54, 63.55, 63.24, 63.28, 62.99, 62.9, 62.14),
date=as.Date("2016-12-01") + c(7*rep(0:2, each=5) + c(0, 1, 4, 5, 6), 21, 22, 26, 27, 28, 29)
)
# Microsoft test (ifelse() is the vectorized form of if/else)
stocks$micr_buy <- ifelse(test = (stocks$micr > 60 & stocks$micr < 62), yes = 1, no = 0)
# Apple test
# (ifelse() strips attributes, so the Date class is lost and raw day
# numbers appear below; restored with the class()<- assignment afterwards)
stocks$apple_date <- ifelse(test = (stocks$apple > 117), yes = stocks$date, no = NA)
# Print stocks
stocks
## apple micr date micr_buy apple_date
## 1 109 59.2 2016-12-01 0 NA
## 2 110 59.2 2016-12-02 0 NA
## 3 109 60.2 2016-12-05 1 NA
## 4 110 60.0 2016-12-06 0 NA
## 5 111 61.4 2016-12-07 1 NA
## 6 112 61.0 2016-12-08 1 NA
## 7 114 62.0 2016-12-09 1 NA
## 8 113 62.2 2016-12-12 0 NA
## 9 115 63.0 2016-12-13 0 NA
## 10 115 62.7 2016-12-14 0 NA
## 11 116 62.6 2016-12-15 0 NA
## 12 116 62.3 2016-12-16 0 NA
## 13 117 63.6 2016-12-19 0 NA
## 14 117 63.5 2016-12-20 0 NA
## 15 117 63.5 2016-12-21 0 17156
## 16 116 63.5 2016-12-22 0 NA
## 17 117 63.2 2016-12-23 0 NA
## 18 117 63.3 2016-12-27 0 17162
## 19 117 63.0 2016-12-28 0 NA
## 20 117 62.9 2016-12-29 0 NA
## 21 116 62.1 2016-12-30 0 NA
# Change the class() of apple_date.
class(stocks$apple_date) <- "Date"
# Print stocks again
stocks
## apple micr date micr_buy apple_date
## 1 109 59.2 2016-12-01 0 <NA>
## 2 110 59.2 2016-12-02 0 <NA>
## 3 109 60.2 2016-12-05 1 <NA>
## 4 110 60.0 2016-12-06 0 <NA>
## 5 111 61.4 2016-12-07 1 <NA>
## 6 112 61.0 2016-12-08 1 <NA>
## 7 114 62.0 2016-12-09 1 <NA>
## 8 113 62.2 2016-12-12 0 <NA>
## 9 115 63.0 2016-12-13 0 <NA>
## 10 115 62.7 2016-12-14 0 <NA>
## 11 116 62.6 2016-12-15 0 <NA>
## 12 116 62.3 2016-12-16 0 <NA>
## 13 117 63.6 2016-12-19 0 <NA>
## 14 117 63.5 2016-12-20 0 <NA>
## 15 117 63.5 2016-12-21 0 2016-12-21
## 16 116 63.5 2016-12-22 0 <NA>
## 17 117 63.2 2016-12-23 0 <NA>
## 18 117 63.3 2016-12-27 0 2016-12-27
## 19 117 63.0 2016-12-28 0 <NA>
## 20 117 62.9 2016-12-29 0 <NA>
## 21 116 62.1 2016-12-30 0 <NA>
Chapter 3 - Loops
Repeat loops:
While loops:
For loops:
Example code includes:
# Stock price
stock_price <- 126.34
repeat {
# New stock price
stock_price <- stock_price * runif(1, .985, 1.01)
print(stock_price)
# Check
if(stock_price < 125) {
print("Stock price is below 125! Buy it while it's cheap!")
break
}
}
## [1] 127
## [1] 126
## [1] 126
## [1] 126
## [1] 127
## [1] 127
## [1] 128
## [1] 126
## [1] 125
## [1] 125
## [1] 125
## [1] "Stock price is below 125! Buy it while it's cheap!"
# Stock price
stock_price <- 67.55
repeat {
# New stock price
stock_price <- stock_price * .995
# Check
if(stock_price < 66) {
print("Stock price is below 66! Buy it while it's cheap!")
break
}
print(stock_price)
}
## [1] 67.2
## [1] 66.9
## [1] 66.5
## [1] 66.2
## [1] "Stock price is below 66! Buy it while it's cheap!"
# Initial debt
debt <- 5000
# While loop to pay off your debt
while (debt > 0) {
debt <- debt - 500
print(paste("Debt remaining", debt))
}
## [1] "Debt remaining 4500"
## [1] "Debt remaining 4000"
## [1] "Debt remaining 3500"
## [1] "Debt remaining 3000"
## [1] "Debt remaining 2500"
## [1] "Debt remaining 2000"
## [1] "Debt remaining 1500"
## [1] "Debt remaining 1000"
## [1] "Debt remaining 500"
## [1] "Debt remaining 0"
debt <- 5000 # initial debt
i <- 0 # x axis counter
x_axis <- i # x axis
y_axis <- debt # y axis
# Initial plot
plot(x_axis, y_axis, xlim = c(0,10), ylim = c(0,5000))
# Graph your debt
while (debt > 0) {
# Updating variables
debt <- debt - 500
i <- i + 1
x_axis <- c(x_axis, i)
y_axis <- c(y_axis, debt)
# Next plot
plot(x_axis, y_axis, xlim = c(0,10), ylim = c(0,5000))
}
# debt and cash
debt <- 5000
cash <- 4000
# Pay off your debt...if you can!
while (debt > 0) {
debt <- debt - 500
cash <- cash - 500
print(paste("Debt remaining:", debt, "and Cash remaining:", cash))
if (cash == 0) {
print("You ran out of cash!")
break
}
}
## [1] "Debt remaining: 4500 and Cash remaining: 3500"
## [1] "Debt remaining: 4000 and Cash remaining: 3000"
## [1] "Debt remaining: 3500 and Cash remaining: 2500"
## [1] "Debt remaining: 3000 and Cash remaining: 2000"
## [1] "Debt remaining: 2500 and Cash remaining: 1500"
## [1] "Debt remaining: 2000 and Cash remaining: 1000"
## [1] "Debt remaining: 1500 and Cash remaining: 500"
## [1] "Debt remaining: 1000 and Cash remaining: 0"
## [1] "You ran out of cash!"
# Sequence
seq <- c(1:10)
# Print loop
for (value in seq) {
print(value)
}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
# A sum variable
sum <- 0
# Sum loop
for (value in seq) {
sum <- sum + value
print(sum)
}
## [1] 1
## [1] 3
## [1] 6
## [1] 10
## [1] 15
## [1] 21
## [1] 28
## [1] 36
## [1] 45
## [1] 55
stock <- data.frame(apple=c(109.49, 109.9, 109.11, 109.95, 111.03, 112.12, 113.95, 113.3, 115.19, 115.19, 115.82, 115.97, 116.64, 116.95, 117.06, 116.29, 116.52, 117.26, 116.76, 116.73, 115.82),
micr=c(59.2, 59.25, 60.22, 59.95, 61.37, 61.01, 61.97, 62.17, 62.98, 62.68, 62.58, 62.3, 63.62, 63.54, 63.54, 63.55, 63.24, 63.28, 62.99, 62.9, 62.14),
date=as.Date("2016-12-01") + c(7*rep(0:2, each=5) + c(0, 1, 4, 5, 6), 21, 22, 26, 27, 28, 29)
)
# Loop over stock rows
for (row in 1:nrow(stock)) {
price <- stock[row, "apple"]
date <- stock[row, "date"]
if(price > 116) {
print(paste("On", date,
"the stock price was", price))
} else {
print(paste("The date:", date,
"is not an important day!"))
}
}
## [1] "The date: 2016-12-01 is not an important day!"
## [1] "The date: 2016-12-02 is not an important day!"
## [1] "The date: 2016-12-05 is not an important day!"
## [1] "The date: 2016-12-06 is not an important day!"
## [1] "The date: 2016-12-07 is not an important day!"
## [1] "The date: 2016-12-08 is not an important day!"
## [1] "The date: 2016-12-09 is not an important day!"
## [1] "The date: 2016-12-12 is not an important day!"
## [1] "The date: 2016-12-13 is not an important day!"
## [1] "The date: 2016-12-14 is not an important day!"
## [1] "The date: 2016-12-15 is not an important day!"
## [1] "The date: 2016-12-16 is not an important day!"
## [1] "On 2016-12-19 the stock price was 116.64"
## [1] "On 2016-12-20 the stock price was 116.95"
## [1] "On 2016-12-21 the stock price was 117.06"
## [1] "On 2016-12-22 the stock price was 116.29"
## [1] "On 2016-12-23 the stock price was 116.52"
## [1] "On 2016-12-27 the stock price was 117.26"
## [1] "On 2016-12-28 the stock price was 116.76"
## [1] "On 2016-12-29 the stock price was 116.73"
## [1] "The date: 2016-12-30 is not an important day!"
# Print out corr
corr <- cor(stock[, c("apple", "micr")])
corr
## apple micr
## apple 1.000 0.948
## micr 0.948 1.000
# Create a nested loop
for(row in 1:nrow(corr)) {
for(col in 1:ncol(corr)) {
print(paste(colnames(corr)[col], "and", rownames(corr)[row],
"have a correlation of", corr[row,col]))
}
}
## [1] "apple and apple have a correlation of 1"
## [1] "micr and apple have a correlation of 0.947701010494998"
## [1] "apple and micr have a correlation of 0.947701010494998"
## [1] "micr and micr have a correlation of 1"
# Print apple
# apple
# Loop through apple. Next if NA. Break if above 117.
# for (value in apple) {
# if(is.na(value)) {
# print("Skipping NA")
# next
# }
#
# if(value > 117) {
# print("Time to sell!")
# break
# } else {
# print("Nothing to do here!")
# }
# }
Chapter 4 - Functions
What are functions?
Writing functions:
Packages:
Example code includes:
# subset help
# ?subset
# Sys.time help
# ?Sys.time
# Round 5.4
round(5.4)
## [1] 5
# Round 5.4 with 1 decimal place
round(5.4, 1)
## [1] 5.4
# numbers
numbers <- c(.002623, pi, 812.33345)
# Round numbers to 3 decimal places
round(numbers, 3)
## [1] 0.003 3.142 812.333
apple <- c(109.49, 109.9, 109.11, 109.95, 111.03, 112.12, 113.95, 113.3, 115.19, 115.19, 115.82, 115.97, 116.64, 116.95, 117.06, 116.29, 116.52, 117.26, 116.76, 116.73, 115.82)
ibm <- c(159.82, 160.02, 159.84, 160.35, 164.79, 165.36, 166.52, 165.5, 168.29, 168.51, 168.02, 166.73, 166.68, 167.6, 167.33, 167.06, 166.71, 167.14, 166.19, 166.6, 165.99)
micr <- c(59.2, 59.25, 60.22, 59.95, 61.37, 61.01, 61.97, 62.17, 62.98, 62.68, 62.58, 62.3, 63.62, 63.54, 63.54, 63.55, 63.24, 63.28, 62.99, 62.9, 62.14)
# cbind() the stocks
stocks <- cbind(apple, ibm, micr)
# cor() to create the correlation matrix
cor(stocks)
## apple ibm micr
## apple 1.000 0.887 0.948
## ibm 0.887 1.000 0.913
## micr 0.948 0.913 1.000
# All at once! Nest cbind() inside of cor()
cor(cbind(apple, ibm, micr))
## apple ibm micr
## apple 1.000 0.887 0.948
## ibm 0.887 1.000 0.913
## micr 0.948 0.913 1.000
# Percent to decimal function
percent_to_decimal <- function(percent) {
  # Convert a percentage (e.g. 6) to its decimal form (0.06).
  # Vectorized: works on a single value or a whole numeric vector.
  percent / 100
}
# Use percent_to_decimal() on 6
percent_to_decimal(6)
## [1] 0.06
# Example percentage
pct <- 8
# Use percent_to_decimal() on pct
percent_to_decimal(pct)
## [1] 0.08
# Percent to decimal function
percent_to_decimal <- function(percent, digits = 2) {
  # Convert percentages to decimals, rounded to `digits` decimal places.
  #
  # percent: numeric vector of percentages (e.g. 25.88 means 25.88%).
  # digits:  number of decimal places to keep (default 2).
  round(percent / 100, digits)
}
# percents
percents <- c(25.88, 9.045, 6.23)
# percent_to_decimal() with default digits
percent_to_decimal(percents)
## [1] 0.26 0.09 0.06
# percent_to_decimal() with digits = 4
percent_to_decimal(percents, digits=4)
## [1] 0.2588 0.0904 0.0623
# Present value function
pv <- function(cash_flow, i, year) {
  # Present value of `cash_flow` received `year` years from now,
  # discounted at annual interest rate `i` (expressed in percent).
  # Relies on percent_to_decimal() defined earlier in this document.
  discount_factor <- (1 + percent_to_decimal(i)) ^ -year
  cash_flow * discount_factor
}
# Calculate a present value
pv(1200, 7, 3)
## [1] 980
library(tidyquant)
## Loading required package: lubridate
##
## Attaching package: 'lubridate'
## The following object is masked _by_ '.GlobalEnv':
##
## origin
## The following objects are masked from 'package:data.table':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
## The following object is masked from 'package:igraph':
##
## %--%
## The following object is masked from 'package:base':
##
## date
## Loading required package: PerformanceAnalytics
##
## Attaching package: 'PerformanceAnalytics'
## The following objects are masked from 'package:e1071':
##
## kurtosis, skewness
## The following object is masked from 'package:graphics':
##
## legend
## Loading required package: quantmod
## Loading required package: TTR
##
## Attaching package: 'TTR'
## The following object is masked from 'package:lavaan':
##
## growth
## Version 0.4-0 included new data defaults. See ?getSymbols.
##
## Attaching package: 'quantmod'
## The following object is masked from 'package:sem':
##
## specifyModel
## == Need to Learn tidyquant? ======================================================================================
## Business Science offers a 1-hour course - Learning Lab #9: Performance Analysis & Portfolio Optimization with tidyquant!
## </> Learn more at: https://university.business-science.io/p/learning-labs-pro </>
# Pull Apple stock data
apple <- tidyquant::tq_get("AAPL", get = "stock.prices", from = "2007-01-03", to = "2017-06-05")
# Take a look at what it returned
head(apple)
## # A tibble: 6 x 7
## date open high low close volume adjusted
## <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2007-01-03 12.3 12.4 11.7 12.0 309579900 10.4
## 2 2007-01-04 12.0 12.3 12.0 12.2 211815100 10.6
## 3 2007-01-05 12.3 12.3 12.1 12.2 208685400 10.6
## 4 2007-01-08 12.3 12.4 12.2 12.2 199276700 10.6
## 5 2007-01-09 12.4 13.3 12.2 13.2 837324600 11.5
## 6 2007-01-10 13.5 14.0 13.4 13.9 738220000 12.1
# Plot the stock price over time
plot(apple$date, apple$adjusted, type = "l")
# Calculate daily stock returns for the adjusted price
apple <- tidyquant::tq_mutate(data = apple, select = adjusted, mutate_fun = dailyReturn)
# Sort the returns from least to greatest
sorted_returns <- sort(apple$daily.returns)
# Plot them
plot(sorted_returns)
Chapter 5 - Apply
Why use apply?
sapply() - simplify:
vapply() - select output type:
Wrap up:
Example code includes:
stock_return <- tibble::tibble(apple=c(0.3745, -0.7188, 0.7699, 0.9823, 0.9817, 1.6322, -0.5704, 1.6681, 0, 0.5469, 0.1295, 0.5777, 0.2658, 0.0941, -0.6578, 0.1978, 0.6351, -0.4264, -0.0257, -0.7796),
ibm=c(0.1251, -0.1125, 0.3191, 2.7689, 0.3459, 0.7015, -0.6125, 1.6858, 0.1307, -0.2908, -0.7678, -0.03, 0.552, -0.1611, -0.1614, -0.2095, 0.2579, -0.5684, 0.2467, -0.3661),
micr=c(0.0845, 1.6371, -0.4484, 2.3686, -0.5866, 1.5735, 0.3227, 1.3029, -0.4763, -0.1595, -0.4474, 2.1188, -0.1257, 0, 0.0157, -0.4878, 0.0633, -0.4583, -0.1429, -1.2083)
)
# Print stock_return
stock_return
## # A tibble: 20 x 3
## apple ibm micr
## <dbl> <dbl> <dbl>
## 1 0.374 0.125 0.0845
## 2 -0.719 -0.112 1.64
## 3 0.770 0.319 -0.448
## 4 0.982 2.77 2.37
## 5 0.982 0.346 -0.587
## 6 1.63 0.702 1.57
## 7 -0.570 -0.612 0.323
## 8 1.67 1.69 1.30
## 9 0 0.131 -0.476
## 10 0.547 -0.291 -0.160
## 11 0.130 -0.768 -0.447
## 12 0.578 -0.03 2.12
## 13 0.266 0.552 -0.126
## 14 0.0941 -0.161 0
## 15 -0.658 -0.161 0.0157
## 16 0.198 -0.210 -0.488
## 17 0.635 0.258 0.0633
## 18 -0.426 -0.568 -0.458
## 19 -0.0257 0.247 -0.143
## 20 -0.780 -0.366 -1.21
# lapply to change percents to decimal
lapply(stock_return, FUN = percent_to_decimal)
## $apple
## [1] 0.00 -0.01 0.01 0.01 0.01 0.02 -0.01 0.02 0.00 0.01 0.00 0.01
## [13] 0.00 0.00 -0.01 0.00 0.01 0.00 0.00 -0.01
##
## $ibm
## [1] 0.00 0.00 0.00 0.03 0.00 0.01 -0.01 0.02 0.00 0.00 -0.01 0.00
## [13] 0.01 0.00 0.00 0.00 0.00 -0.01 0.00 0.00
##
## $micr
## [1] 0.00 0.02 0.00 0.02 -0.01 0.02 0.00 0.01 0.00 0.00 0.00 0.02
## [13] 0.00 0.00 0.00 0.00 0.00 0.00 0.00 -0.01
# Print stock_return
stock_return
## # A tibble: 20 x 3
## apple ibm micr
## <dbl> <dbl> <dbl>
## 1 0.374 0.125 0.0845
## 2 -0.719 -0.112 1.64
## 3 0.770 0.319 -0.448
## 4 0.982 2.77 2.37
## 5 0.982 0.346 -0.587
## 6 1.63 0.702 1.57
## 7 -0.570 -0.612 0.323
## 8 1.67 1.69 1.30
## 9 0 0.131 -0.476
## 10 0.547 -0.291 -0.160
## 11 0.130 -0.768 -0.447
## 12 0.578 -0.03 2.12
## 13 0.266 0.552 -0.126
## 14 0.0941 -0.161 0
## 15 -0.658 -0.161 0.0157
## 16 0.198 -0.210 -0.488
## 17 0.635 0.258 0.0633
## 18 -0.426 -0.568 -0.458
## 19 -0.0257 0.247 -0.143
## 20 -0.780 -0.366 -1.21
# lapply to get the average returns
lapply(stock_return, FUN=mean)
## $apple
## [1] 0.284
##
## $ibm
## [1] 0.193
##
## $micr
## [1] 0.247
# Sharpe ratio
sharpe <- function(returns) {
  # Sharpe ratio with a hard-coded risk-free rate of 0.03% (0.0003):
  # mean excess return per unit of return volatility.
  excess_return <- mean(returns) - 0.0003
  excess_return / sd(returns)
}
# lapply to get the sharpe ratio
lapply(stock_return, FUN=sharpe)
## $apple
## [1] 0.396
##
## $ibm
## [1] 0.237
##
## $micr
## [1] 0.248
# sharpe
sharpe <- function(returns, rf = 0.0003) {
  # Sharpe ratio with a configurable risk-free rate `rf`
  # (default 0.03%, matching the earlier hard-coded version).
  avg_excess <- mean(returns) - rf
  volatility <- sd(returns)
  avg_excess / volatility
}
# First lapply()
lapply(stock_return, FUN=sharpe, rf=0.0004)
## $apple
## [1] 0.396
##
## $ibm
## [1] 0.236
##
## $micr
## [1] 0.248
# Second lapply()
lapply(stock_return, FUN=sharpe, rf=0.0009)
## $apple
## [1] 0.395
##
## $ibm
## [1] 0.236
##
## $micr
## [1] 0.248
# lapply() on stock_return
lapply(stock_return, FUN=sharpe)
## $apple
## [1] 0.396
##
## $ibm
## [1] 0.237
##
## $micr
## [1] 0.248
# sapply() on stock_return
sapply(stock_return, FUN=sharpe)
## apple ibm micr
## 0.396 0.237 0.248
# sapply() on stock_return with optional arguments
sapply(stock_return, FUN=sharpe, simplify=FALSE, USE.NAMES=FALSE)
## $apple
## [1] 0.396
##
## $ibm
## [1] 0.237
##
## $micr
## [1] 0.248
# Market crash with as.Date()
market_crash <- list(dow_jones_drop = 777.68, date = as.Date("2008-09-28"))
# Find the classes with sapply()
sapply(market_crash, FUN=class)
## dow_jones_drop date
## "numeric" "Date"
# Market crash with as.POSIXct()
market_crash2 <- list(dow_jones_drop = 777.68, date = as.POSIXct("2008-09-28"))
# Find the classes with lapply()
lapply(market_crash2, FUN=class)
## $dow_jones_drop
## [1] "numeric"
##
## $date
## [1] "POSIXct" "POSIXt"
# Find the classes with sapply()
sapply(market_crash2, FUN=class)
## $dow_jones_drop
## [1] "numeric"
##
## $date
## [1] "POSIXct" "POSIXt"
# Market crash with as.POSIXct()
market_crash2 <- list(dow_jones_drop = 777.68, date = as.POSIXct("2008-09-28"))
# Find the classes with sapply()
sapply(market_crash2, FUN=class)
## $dow_jones_drop
## [1] "numeric"
##
## $date
## [1] "POSIXct" "POSIXt"
# Find the classes with vapply() - this has been commented out because it is designed to crash
# vapply(market_crash2, FUN=class, FUN.VALUE=character(1))
# Sharpe ratio for all stocks
vapply(stock_return, FUN=sharpe, FUN.VALUE = numeric(1))
## apple ibm micr
## 0.396 0.237 0.248
# Summarize Apple
summary(stock_return$apple)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.780 -0.126 0.232 0.284 0.669 1.668
# Summarize all stocks
vapply(stock_return, FUN=summary, FUN.VALUE = numeric(6))
## apple ibm micr
## Min. -0.780 -0.7678 -1.2083
## 1st Qu. -0.126 -0.2298 -0.4509
## Median 0.232 0.0475 -0.0629
## Mean 0.284 0.1927 0.2473
## 3rd Qu. 0.669 0.3258 0.5677
## Max. 1.668 2.7689 2.3686
# Max and min
vapply(stock_return, FUN = function(x) { c(max(x), min(x)) }, FUN.VALUE = numeric(2))
## apple ibm micr
## [1,] 1.67 2.769 2.37
## [2,] -0.78 -0.768 -1.21
Chapter 1 - Introduction and Downloading Data
Overview - Joshua Ulrich is the author of several R packages including TTR, xts, quantmod, quantstrat, blotter:
Introduction to Quandl:
Finding and downloading data from internet sources:
Example code includes:
# Import QQQ data from Yahoo! Finance
quantmod::getSymbols("QQQ", auto.assign=TRUE)
## 'getSymbols' currently uses auto.assign=TRUE by default, but will
## use auto.assign=FALSE in 0.5-0. You will still be able to use
## 'loadSymbols' to automatically load data. getOption("getSymbols.env")
## and getOption("getSymbols.auto.assign") will still be checked for
## alternate defaults.
##
## This message is shown once per session and may be disabled by setting
## options("getSymbols.warning4.0"=FALSE). See ?getSymbols for details.
## [1] "QQQ"
# Look at the structure of the object getSymbols created
str(QQQ)
## An 'xts' object on 2007-01-03/2019-10-18 containing:
## Data: num [1:3222, 1:6] 43.5 43.3 44 43.9 44 ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr [1:6] "QQQ.Open" "QQQ.High" "QQQ.Low" "QQQ.Close" ...
## Indexed by objects of class: [Date] TZ: UTC
## xts Attributes:
## List of 2
## $ src : chr "yahoo"
## $ updated: POSIXct[1:1], format: "2019-10-21 09:59:37"
# Look at the first few rows of QQQ
head(QQQ)
## QQQ.Open QQQ.High QQQ.Low QQQ.Close QQQ.Volume QQQ.Adjusted
## 2007-01-03 43.5 44.1 42.5 43.2 1.68e+08 38.6
## 2007-01-04 43.3 44.2 43.2 44.1 1.37e+08 39.3
## 2007-01-05 44.0 44.0 43.5 43.8 1.39e+08 39.1
## 2007-01-08 43.9 44.1 43.6 43.9 1.06e+08 39.1
## 2007-01-09 44.0 44.3 43.6 44.1 1.22e+08 39.3
## 2007-01-10 44.0 44.7 43.8 44.6 1.21e+08 39.8
# Import QQQ data from Alpha Vantage
# quantmod::getSymbols("QQQ", src="av")
# Look at the structure of QQQ
# str(QQQ)
# Import GDP data from FRED
quantmod::getSymbols("GDP", src="FRED", auto.assign=TRUE)
## [1] "GDP"
# Look at the structure of GDP
str(GDP)
## An 'xts' object on 1947-01-01/2019-04-01 containing:
## Data: num [1:290, 1] 243 246 250 260 266 ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr "GDP"
## Indexed by objects of class: [Date] TZ: UTC
## xts Attributes:
## List of 2
## $ src : chr "FRED"
## $ updated: POSIXct[1:1], format: "2019-10-21 09:59:37"
# There are two arguments that will make getSymbols() return the data:
# Set auto.assign = FALSE.
# Set env = NULL.
# The two methods are functionally equivalent, but I encourage you to use the first method because the auto.assign argument describes the behavior better
# Your future self will be more likely to remember what auto.assign = FALSE means than what env = NULL means
# Assign SPY data to 'spy' using auto.assign argument
spy <- quantmod::getSymbols("SPY", auto.assign=FALSE)
# Look at the structure of the 'spy' object
str(spy)
## An 'xts' object on 2007-01-03/2019-10-18 containing:
## Data: num [1:3222, 1:6] 142 141 141 141 141 ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr [1:6] "SPY.Open" "SPY.High" "SPY.Low" "SPY.Close" ...
## Indexed by objects of class: [Date] TZ: UTC
## xts Attributes:
## List of 2
## $ src : chr "yahoo"
## $ updated: POSIXct[1:1], format: "2019-10-21 09:59:38"
# Assign JNJ data to 'jnj' using env argument
jnj <- quantmod::getSymbols("JNJ", env=NULL, auto.assign=TRUE)
# Look at the structure of the 'jnj' object
str(jnj)
## An 'xts' object on 2007-01-03/2019-10-18 containing:
## Data: num [1:3222, 1:6] 66.1 65.9 66.8 66.5 66.6 ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr [1:6] "JNJ.Open" "JNJ.High" "JNJ.Low" "JNJ.Close" ...
## Indexed by objects of class: [Date] TZ: UTC
## xts Attributes:
## List of 2
## $ src : chr "yahoo"
## $ updated: POSIXct[1:1], format: "2019-10-21 09:59:39"
# Import GDP data from FRED
gdp <- Quandl::Quandl(code="FRED/GDP")
# Look at the structure of the object returned by Quandl
str(gdp)
## 'data.frame': 290 obs. of 2 variables:
## $ Date : Date, format: "2019-04-01" "2019-01-01" ...
## $ Value: num 21340 21099 20898 20750 20510 ...
## - attr(*, "freq")= chr "quarterly"
# Import GDP data from FRED as xts
gdp_xts <- Quandl::Quandl(code="FRED/GDP", type="xts")
# Look at the structure of gdp_xts
str(gdp_xts)
## An 'xts' object on 1947 Q1/2019 Q2 containing:
## Data: num [1:290, 1] 243 246 250 260 266 ...
## Indexed by objects of class: [yearqtr] TZ: UTC
## xts Attributes:
## NULL
# Import GDP data from FRED as zoo
gdp_zoo <- Quandl::Quandl(code="FRED/GDP", type="zoo")
# Look at the structure of gdp_zoo
str(gdp_zoo)
## 'zooreg' series from 1947 Q1 to 2019 Q2
## Data: num [1:290] 243 246 250 260 266 ...
## Index: 'yearqtr' num [1:290] 1947 Q1 1947 Q2 1947 Q3 1947 Q4 ...
## Frequency: 4
# Create an object containing the Pfizer ticker symbol
symbol <- "PFE"
# Use getSymbols to import the data
quantmod::getSymbols(symbol, auto.assign=TRUE)
## [1] "PFE"
# Look at the first few rows of data
head(PFE)
## PFE.Open PFE.High PFE.Low PFE.Close PFE.Volume PFE.Adjusted
## 2007-01-03 26.1 26.4 26.0 26.3 40644800 15.6
## 2007-01-04 26.4 26.6 26.3 26.4 32246200 15.7
## 2007-01-05 26.5 26.6 26.2 26.3 31353500 15.6
## 2007-01-08 26.3 26.4 25.9 26.2 43223500 15.5
## 2007-01-09 26.2 26.3 26.0 26.2 31304200 15.6
## 2007-01-10 26.1 26.3 26.0 26.2 34547300 15.6
# Create a currency_pair object
currency_pair <- "GBP/CAD"
# Load British Pound to Canadian Dollar exchange rate data
quantmod::getSymbols(currency_pair, src="oanda", auto.assign=TRUE)
## [1] "GBP/CAD"
# Examine object using str()
str(GBPCAD)
## An 'xts' object on 2019-04-25/2019-10-20 containing:
## Data: num [1:179, 1] 1.74 1.74 1.74 1.74 1.74 ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr "GBP.CAD"
## Indexed by objects of class: [Date] TZ: UTC
## xts Attributes:
## List of 2
## $ src : chr "oanda"
## $ updated: POSIXct[1:1], format: "2019-10-21 09:59:43"
# Try to load data from 190 days ago
# quantmod::getSymbols(currency_pair, from = Sys.Date() - 190, to = Sys.Date(), src = "oanda")
# Drives Warning: Oanda only provides historical data for the past 180 days. Symbol: GBP/CAD
# Create a series_name object
series_name <- "UNRATE"
# Load the data using getSymbols
quantmod::getSymbols(series_name, src="FRED", auto.assign=TRUE)
## [1] "UNRATE"
# Create a quandl_code object
quandl_code <- paste0("FRED/", series_name)
# Load the data using Quandl
unemploy_rate <- Quandl::Quandl(quandl_code)
Chapter 2 - Extracting and Transforming Data
Extracting specific columns:
Loading and transforming multiple instruments:
Example code includes:
# Look at the head of PFE
head(PFE)
## PFE.Open PFE.High PFE.Low PFE.Close PFE.Volume PFE.Adjusted
## 2007-01-03 26.1 26.4 26.0 26.3 40644800 15.6
## 2007-01-04 26.4 26.6 26.3 26.4 32246200 15.7
## 2007-01-05 26.5 26.6 26.2 26.3 31353500 15.6
## 2007-01-08 26.3 26.4 25.9 26.2 43223500 15.5
## 2007-01-09 26.2 26.3 26.0 26.2 31304200 15.6
## 2007-01-10 26.1 26.3 26.0 26.2 34547300 15.6
# Extract the close column with the quantmod Cl() extractor
pfe_close <- quantmod::Cl(PFE)
# Look at the head of pfe_close
head(pfe_close)
## PFE.Close
## 2007-01-03 26.3
## 2007-01-04 26.4
## 2007-01-05 26.3
## 2007-01-08 26.2
## 2007-01-09 26.2
## 2007-01-10 26.2
# Extract the volume column with Vo()
pfe_volume <- quantmod::Vo(PFE)
# Look at the head of pfe_volume
head(pfe_volume)
## PFE.Volume
## 2007-01-03 40644800
## 2007-01-04 32246200
## 2007-01-05 31353500
## 2007-01-08 43223500
## 2007-01-09 31304200
## 2007-01-10 34547300
# Extract the high, low, and close columns with HLC()
pfe_hlc <- quantmod::HLC(PFE)
# Look at the head of pfe_hlc
head(pfe_hlc)
## PFE.High PFE.Low PFE.Close
## 2007-01-03 26.4 26.0 26.3
## 2007-01-04 26.6 26.3 26.4
## 2007-01-05 26.6 26.2 26.3
## 2007-01-08 26.4 25.9 26.2
## 2007-01-09 26.3 26.0 26.2
## 2007-01-10 26.3 26.0 26.2
# Extract the open, high, low, close, and volume columns with OHLCV()
pfe_ohlcv <- quantmod::OHLCV(PFE)
# Look at the head of pfe_ohlcv
head(pfe_ohlcv)
## PFE.Open PFE.High PFE.Low PFE.Close PFE.Volume
## 2007-01-03 26.1 26.4 26.0 26.3 40644800
## 2007-01-04 26.4 26.6 26.3 26.4 32246200
## 2007-01-05 26.5 26.6 26.2 26.3 31353500
## 2007-01-08 26.3 26.4 25.9 26.2 43223500
## 2007-01-09 26.2 26.3 26.0 26.2 31304200
## 2007-01-10 26.1 26.3 26.0 26.2 34547300
# Download CME data for CL and BZ as an xts object
# oil_data <- Quandl::Quandl(code = c("CME/CLH2016", "CME/BZH2016"), type = "xts")
# Look at the column names of the oil_data object
# colnames(oil_data)
# Extract the Open price for CLH2016
# cl_open <- quantmod::getPrice(oil_data, symbol = "CLH2016", prefer = "Open$")
# Look at January, 2016 using xts' ISO-8601 subsetting
# cl_open["2016-01"]
# CL and BZ Quandl codes
quandl_codes <- c("CME/CLH2016","CME/BZH2016")
# Download quarterly CL and BZ prices
# qtr_price <- Quandl::Quandl(quandl_codes, type="xts", collapse="quarterly")
# View the high prices for both series
# quantmod::Hi(qtr_price)
# Download quarterly CL and BZ returns
# qtr_return <- Quandl::Quandl(quandl_codes, type="xts", collapse="quarterly", transform="rdiff")
# View the settle price returns for both series
# quantmod::getPrice(qtr_return, prefer="Settle")
# One paradigm you can use in the quantmod workflow involves environments
# Store all your data in one environment
# Then you can use eapply() to call a function on each object in the environment, much like what lapply() does for each element of a list
# Also like lapply(), eapply() returns a list
# Then you can merge all the elements of the list into one object by using do.call(), which is like having R programmatically type and run a command for you
# Instead of typing merge(my_list[[1]], my_list[[2]]], ...), you can type do.call(merge, my_list)
# Call head on each object in data_env using eapply
# data_list <- eapply(data_env, FUN=head)
# Merge all the list elements into one xts object
# data_merged <- do.call(merge, data_list)
# Ensure the columns are ordered: open, high, low, close
# data_ohlc <- quantmod::OHLC(data_merged)
# Symbols
symbols <- c("AAPL", "MSFT", "IBM")
# Create new environment
data_env <- new.env()
# Load symbols into data_env
quantmod::getSymbols(symbols, env=data_env, auto.assign=TRUE)
## 'getSymbols' currently uses auto.assign=TRUE by default, but will
## use auto.assign=FALSE in 0.5-0. You will still be able to use
## 'loadSymbols' to automatically load data. getOption("getSymbols.env")
## and getOption("getSymbols.auto.assign") will still be checked for
## alternate defaults.
##
## This message is shown once per session and may be disabled by setting
## options("getSymbols.warning4.0"=FALSE). See ?getSymbols for details.
## [1] "AAPL" "MSFT" "IBM"
# Extract the close column from each object and combine into one xts object
close_data <- do.call(merge, eapply(data_env, quantmod::Cl))
# View the head of close_data
head(close_data)
## AAPL.Close IBM.Close MSFT.Close
## 2007-01-03 12.0 97.3 29.9
## 2007-01-04 12.2 98.3 29.8
## 2007-01-05 12.2 97.4 29.6
## 2007-01-08 12.2 98.9 29.9
## 2007-01-09 13.2 100.1 30.0
## 2007-01-10 13.9 98.9 29.7
Chapter 3 - Managing Data from Multiple Sources
Setting default arguments for getSymbols():
Setting per-instrument default arguments:
Handling instrument symbols that clash or are not valid R names (e.g. BRK-A, 000001.SS):
Example code includes:
# DO NOT RUN - no API Key for data
# Set the default to pull data from Alpha Vantage
quantmod::setDefaults(getSymbols, src="av")
# Get GOOG data
quantmod::getSymbols("GOOG")
# Verify the data was actually pulled from Alpha Vantage
str(GOOG)
# Look at getSymbols.yahoo arguments
args(getSymbols.yahoo)
# Set default 'from' value for getSymbols.yahoo
quantmod::setDefaults(getSymbols.yahoo, from = "2000-01-01")
# Confirm defaults were set correctly
quantmod::getDefaults("getSymbols.yahoo")
# Changing the default source for one instrument is useful if multiple sources use the same symbol for different instruments
# For example, getSymbols("CP", src = "yahoo") would load Canadian Pacific Railway data from the New York Stock Exchange
# But getSymbols("CP", src = "FRED") would load Corporate Profits After Tax from the U.S. Bureau of Economic Analysis
# You can use setSymbolLookup() to specify the default data source for an instrument
# In this exercise, you will learn how to make getSymbols("CP") load the corporate profit data from FRED instead of the railway stock data from Yahoo Finance.
# setSymbolLookup() can take any number of name = value pairs, where name is the symbol and value is a named list of getSymbols() arguments for that one symbol
# Look at the first few rows of CP
head(CP)
# Set the source for CP to FRED
quantmod::setSymbolLookup(CP="FRED")
# Load CP data again
quantmod::getSymbols("CP")
# Look at the first few rows of CP
head(CP)
# Save symbol lookup table
quantmod::saveSymbolLookup("my_symbol_lookup.rda")
# Set default source for CP to "yahoo"
quantmod::setSymbolLookup(CP="yahoo")
# Verify the default source is "yahoo"
quantmod::getSymbolLookup("CP")
# Load symbol lookup table
quantmod::loadSymbolLookup("my_symbol_lookup.rda")
# Verify the default source is "FRED"
quantmod::getSymbolLookup("CP")
# Load BRK-A data
quantmod::getSymbols("BRK-A")
# Use backticks and head() to look at the loaded data
head(`BRK-A`)
# Use get() to assign the BRK-A data to an object named BRK.A
BRK.A <- get("BRK-A")
# Create BRK.A object
BRK.A <- quantmod::getSymbols("BRK-A", auto.assign=FALSE)
# Create col_names object with the column names of BRK.A
col_names <- colnames(BRK.A)
# Set BRK.A column names to syntactically valid names
colnames(BRK.A) <- make.names(col_names)
# Set name for BRK-A to BRK.A
quantmod::setSymbolLookup("BRK.A" = list(name="BRK-A"))
# Set name for T (AT&T) to ATT
quantmod::setSymbolLookup("ATT" = list(name="T"))
# Load BRK.A and ATT data
quantmod::getSymbols(c("BRK.A", "ATT"))
Chapter 4 - Aligning Data with Different Periodicities
Making irregular data regular:
Aggregating to lower frequency:
Aggregating and combining intra-day data:
Example code includes:
# DO NOT HAVE DATA
# Extract the start date of the series
start_date <- start(irregular_xts)
# Extract the end date of the series
end_date <- end(irregular_xts)
# Create a regular date sequence
regular_index <- seq(start_date, end_date, by="day")
# Create a zero-width xts object
regular_xts <- xts(, order.by=regular_index)
# Merge irregular_xts and regular_xts
merged_xts <- merge(irregular_xts, regular_xts)
# Look at the first few rows of merged_xts
head(merged_xts)
# Use the fill argument to fill NA with their previous value
merged_filled_xts <- merge(irregular_xts, regular_xts, fill=na.locf)
# Look at the first few rows of merged_filled_xts
head(merged_filled_xts)
# Aggregate DFF to monthly
monthly_fedfunds <- apply.monthly(DFF, FUN=mean)
# Convert index to yearmon
index(monthly_fedfunds) <- as.yearmon(index(monthly_fedfunds))
# Merge FEDFUNDS with the monthly aggregate
merged_fedfunds <- merge(FEDFUNDS, monthly_fedfunds)
# Look at the first few rows of the merged object
head(merged_fedfunds)
# Look at the first few rows of merged_fedfunds
head(merged_fedfunds)
# Fill NA forward
merged_fedfunds_locf <- na.locf(merged_fedfunds)
# Extract index values containing last day of month
aligned_last_day <- merged_fedfunds_locf[index(monthly_fedfunds)]
# Fill NA backward
merged_fedfunds_locb <- na.locf(merged_fedfunds, fromLast=TRUE)
# Extract index values containing first day of month
aligned_first_day <- merged_fedfunds_locb[index(FEDFUNDS)]
# Extract index weekdays
index_weekdays <- .indexwday(DFF)
# Find locations of Wednesdays
wednesdays <- which(index_weekdays == 3)
# Create custom end points
end_points <- c(0, wednesdays, nrow(DFF))
# Calculate weekly mean using custom end points
weekly_mean <- period.apply(DFF, end_points, FUN=mean)
# Create merged object with a Europe/London timezone
tz_london <- merge(london, chicago)
# Look at tz_london structure
str(tz_london)
# Create merged object with a America/Chicago timezone
tz_chicago <- merge(chicago, london)
# Look at tz_chicago structure
str(tz_chicago)
# Regularize an intraday series to a 30-minute grid, restrict to trading
# hours, and fill NA only within each day (so the first fill of a day does
# not carry over yesterday's close).
# Create a regular date-time sequence
regular_index <- seq(as.POSIXct("2010-01-04 09:00"), as.POSIXct("2010-01-08 16:00"), by = "30 min")
# Create a zero-width xts object (index only, no data)
regular_xts <- xts(, order.by=regular_index)
# Merge irregular_xts and regular_xts, filling NA with their previous value
merged_xts <- merge(irregular_xts, regular_xts, fill=na.locf)
# Subset to trading day (9AM - 4PM) using xts time-of-day subsetting
trade_day <- merged_xts["T09:00/T16:00"]
# Split trade_day into days (a list with one xts element per day)
daily_list <- split(trade_day , f = "days")
# Use lapply to call na.locf for each day in daily_list
daily_filled <- lapply(daily_list, FUN = na.locf)
# Use do.call to rbind the results back into a single xts object
filled_by_trade_day <- do.call(rbind, daily_filled)
# Downsample raw intraday prices with to.period(); each call returns an
# OHLC-style series at the requested bar size.
# Convert raw prices to 5-second prices
xts_5sec <- to.period(intraday_xts, period = "seconds", k = 5)
# Convert raw prices to 10-minute prices
xts_10min <- to.period(intraday_xts, period = "minutes", k = 10)
# Convert raw prices to 1-hour prices
xts_1hour <- to.period(intraday_xts, period = "hours", k = 1)
Chapter 5 - Importing Text Data, Adjusting for Corporate Actions
Importing text files:
Checking for weirdness - missing values and corporate actions:
Adjusting financial time series:
Wrap up:
Example code includes:
# Round-trip AMZN prices: download from Yahoo, save to CSV, reload via both
# getSymbols(src="csv") and read.zoo().
# DO NOT RUN DUE TO NOT HAVING DATA
library(quantmod)
getSymbols("AMZN", src="yahoo")
str(AMZN)
# Save as CSV; sep="," is required so the comma-separated readers below
# (getSymbols src="csv" and read.zoo sep=",") can parse it — write.zoo's
# default separator is a space
write.zoo(AMZN, "./RInputFiles/AMZN.csv", sep = ",")
rm(AMZN)
# Load AMZN.csv: getSymbols(src="csv") takes the bare symbol name plus a
# dir= argument and builds the path <dir>/<Symbol>.csv itself, so passing a
# file path as the symbol would look for "./RInputFiles/AMZN.csv.csv"
getSymbols("AMZN", src="csv", dir="./RInputFiles")
# Look at AMZN structure
str(AMZN)
# Import AMZN.csv using read.zoo (first column becomes the index)
amzn_zoo <- read.zoo("./RInputFiles/AMZN.csv", sep = ",", header = TRUE)
# Convert to xts
amzn_xts <- as.xts(amzn_zoo)
# Look at the first few rows of amzn_xts
head(amzn_xts)
# Two read.zoo() patterns: a multi-column index (Date + Time) and a long
# file split into wide columns by symbol/type.
# Read data with read.csv (first 5 rows only, to inspect the layout)
une_data <- read.csv("UNE.csv", nrows = 5)
# Look at the structure of une_data
str(une_data)
# Read data with read.zoo, specifying index columns; Date and Time are
# combined into a single date-time index
une_zoo <- read.zoo("UNE.csv", index.column = c("Date", "Time"), sep = ",", header = TRUE)
# Look at first few rows of data
head(une_zoo)
# Read data with read.csv (first 5 rows only)
two_symbols_data <- read.csv("two_symbols.csv", nrows = 5)
# Look at the structure of two_symbols_data
str(two_symbols_data)
# Read data with read.zoo, specifying index columns; split= pivots the long
# file so each Symbol/Type combination becomes its own column
two_symbols_zoo <- read.zoo("two_symbols.csv", split = c("Symbol", "Type"), sep = ",", header = TRUE)
# Look at first few rows of data
head(two_symbols_zoo)
# Compare three NA-filling strategies on the 10-year Treasury series (DGS10)
# and plot them together.
# fill NA using last observation carried forward
locf <- na.locf(DGS10)
# fill NA using linear interpolation
approx <- na.approx(DGS10)
# fill NA using spline interpolation
spline <- na.spline(DGS10)
# merge into one object (three columns, one per method)
na_filled <- merge(locf, approx, spline)
# plot combined object; colors follow column order: locf black, approx red,
# spline green
plot(na_filled, col = c("black", "red", "green"))
# Inspect raw vs adjusted AAPL prices. Cl()/Ad()/Vo() extract the close,
# adjusted close, and volume columns from an OHLC object.
# Look at the last few rows of AAPL data
tail(AAPL)
# Plot close price
plot(Cl(AAPL))
# Plot adjusted close price
plot(Ad(AAPL))
# Look at first few rows of aapl_raw
head(aapl_raw)
# Look at first few rows of aapl_split
head(aapl_split_adjusted)
# Plot difference between adjusted close and split-adjusted close
# (nonzero values reflect the dividend component of the adjustment)
plot(Ad(aapl_raw) - Cl(aapl_split_adjusted))
# Plot difference between volume from the raw and split-adjusted sources
plot(Vo(aapl_raw) - Vo(aapl_split_adjusted))
# Look at first few rows of AAPL
head(AAPL)
# Adjust AAPL for splits and dividends (adjustOHLC works on full OHLC data)
aapl_adjusted <- adjustOHLC(AAPL)
# Look at first few rows of aapl_adjusted
head(aapl_adjusted)
# The previous exercise taught you how to adjust raw historical OHLC prices for splits and dividends using adjustOHLC()
# But adjustOHLC() only works for OHLC data
# It will not work if you only have close prices. adjustOHLC() also does not return any of the split or dividend data it uses.
# You will need the dates and values for each split and dividend if you want to adjust a non-OHLC price series, or if you simply want to analyze the raw split and dividend data.
# You can download the split and dividend data from Yahoo Finance using the quantmod functions getSplits() and getDividends(), respectively
# Note that the historical dividend data from Yahoo Finance is adjusted for splits
# If you want to download unadjusted dividend data, you need to set split.adjust = FALSE in your call to getDividends()
# Download AAPL split data
splits <- getSplits("AAPL")
# Print the splits object
splits
# Download AAPL dividend data (split-adjusted, per Yahoo's default)
dividends <- getDividends("AAPL")
# Look at the first few rows of dividends
head(dividends)
# Download unadjusted AAPL dividend data
raw_dividends <- getDividends("AAPL", split.adjust=FALSE)
# Look at the first few rows of raw_dividends
head(raw_dividends)
# Remember that adjustOHLC() only works for OHLC data
# If you only have close prices, you can use the adjRatios() function to calculate the split and dividend adjustment ratios
# The adjRatios() function has three arguments: splits, dividends, and close
# It returns an xts object with two columns, "Split" and "Div", that contain the split and dividend adjustment ratios, respectively
# adjRatios() needs split data in order to calculate the split adjustment ratio
# You provide split data via the splits argument
# To calculate the dividend adjustment ratio, you need to supply raw dividends and raw prices to adjRatios(), using the dividends and close arguments, respectively
# Once you have the split and dividend adjustment ratios, calculating the adjusted price is simple
# You just have to multiply the unadjusted price by both the split and dividend adjustment ratios
# Calculate split and dividend adjustment ratios (note: raw_dividends, not
# the split-adjusted dividends object)
ratios <- adjRatios(splits = splits, dividends = raw_dividends, close = Cl(AAPL))
# Calculate adjusted close for AAPL
aapl_adjusted <- Cl(AAPL) * ratios[, "Split"] * ratios[, "Div"]
# Look at first few rows of Yahoo adjusted close (for comparison)
head(Ad(AAPL))
# Look at first few rows of aapl_adjusted
head(aapl_adjusted)
Chapter 1 - The Building Blocks
Introduction:
Portfolio Weights:
Portfolio Return:
Performance Analytics:
Example code includes:
# Load price data and plot the KO/GE price ratio. The commented load()
# calls are alternative data sources kept for reference.
# load("./RInputFiles/sp500.RData")
# load("./RInputFiles/aapl_msft.RData")
# load("./RInputFiles/eq_prices.RData")
prices <- readRDS("./RInputFiles/prices.rds")
# Load package PerformanceAnalytics
library(PerformanceAnalytics)
# Extract the Coca-Cola and General Electric price columns
ko <- prices$KO
ge <- prices$GE
# Define ko_ge, the KO/GE price ratio (the DataCamp exercise used KO/PEP;
# these notes substitute GE)
ko_ge <- ko / ge
# Make a time series plot of ko_ge
zoo::plot.zoo(ko_ge)
# Add as a reference, a horizontal line at 1
abline(h=1)
# Two portfolio-weight examples: value weights from invested amounts, then
# market-cap weights. In both cases the weights sum to 1.
# Define the vector values
values <- c(4000, 4000, 2000)
# Define the vector weights (share of total value in each asset)
weights <- values / sum(values)
# Print the resulting weights
weights
## [1] 0.4 0.4 0.2
# Define marketcaps
marketcaps <- c(5, 8, 9, 20, 25, 100, 100, 500, 700, 2000)
# Compute the weights
weights <- marketcaps / sum(marketcaps)
# Inspect summary statistics
summary(weights)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.001 0.003 0.018 0.100 0.115 0.577
# Create a barplot of weights
barplot(weights)
# Compute a portfolio's simple return from initial and final asset values.
# Initial value invested in each asset
in_values <- c(1000, 5000, 2000)
# Value of each asset at the end of the period
fin_values <- c(1100, 4500, 3000)
# Total amount initially invested across all assets
total_invested <- sum(in_values)
# Portfolio weights: the share of total value held in each asset
weights <- in_values / total_invested
# Per-asset simple returns over the period
returns <- (fin_values - in_values)/in_values
# Portfolio return: the weight-weighted sum of the asset returns
preturns <- sum(returns * weights)
# Print the first and last six rows of prices
# (head here; the matching tail() call follows the printed output below)
head(prices)
## AA AAPL AXP BA BAC CAT CVX DD DIS GE HD HPQ INTC
## 1990-12-31 4.53 1.35 3.45 13.8 2.74 3.26 7.44 7.68 6.43 2.36 2.04 1.00 0.841
## 1991-01-31 5.12 1.74 3.80 15.0 3.37 3.47 7.31 7.58 6.87 2.63 2.29 1.23 1.000
## 1991-02-28 5.10 1.79 4.01 14.7 3.50 3.81 7.80 7.95 7.83 2.81 2.58 1.47 1.044
## 1991-03-31 5.21 2.13 4.82 14.3 4.20 3.34 7.98 7.85 7.55 2.88 2.86 1.58 1.022
## 1991-04-30 5.40 1.72 4.21 14.0 4.47 3.35 8.10 8.80 7.31 2.93 3.04 1.62 1.076
## 1991-05-31 5.69 1.48 4.32 15.0 5.11 3.64 7.74 10.09 7.37 3.22 3.49 1.71 1.218
## IBM JNJ JPM KO MCD MMM MRK MSFT NKE PFE PG TRV UTX
## 1990-12-31 18.3 5.10 1.65 3.40 4.58 10.7 6.47 0.724 1.02 1.67 5.99 7.68 3.47
## 1991-01-31 20.5 5.44 2.02 3.56 4.49 10.6 6.59 0.944 1.22 1.86 5.52 7.81 3.44
## 1991-02-28 21.0 5.87 2.53 3.83 4.98 11.1 7.35 0.998 1.22 2.19 5.65 8.11 3.64
## 1991-03-31 18.6 6.85 2.71 3.98 5.48 11.1 7.64 1.021 1.17 2.23 5.92 8.61 3.55
## 1991-04-30 16.8 6.75 3.19 3.87 5.29 11.2 7.80 0.952 1.21 2.27 5.87 8.72 3.37
## 1991-05-31 17.5 6.50 3.33 4.20 5.54 12.1 8.60 1.056 1.01 2.47 5.96 8.00 3.47
## VZ WMT XOM T
## 1990-12-31 7.84 5.46 6.23 4.64
## 1991-01-31 7.17 5.95 6.22 4.39
## 1991-02-28 7.24 6.38 6.73 4.51
## 1991-03-31 7.59 7.00 7.14 4.71
## 1991-04-30 7.48 7.31 7.26 4.54
## 1991-05-31 7.05 7.74 7.19 4.40
# Last six rows of the monthly price panel
tail(prices)
## AA AAPL AXP BA BAC CAT CVX DD DIS GE HD HPQ INTC IBM
## 2015-07-31 9.77 120 75.1 141 17.7 76.8 85.2 54.7 119 25.5 115 13.5 28.3 158
## 2015-08-31 9.38 112 75.7 129 16.2 74.7 79.0 50.9 101 24.2 115 12.4 28.1 145
## 2015-09-30 9.59 109 73.2 129 15.5 63.8 77.0 47.6 102 24.8 114 11.4 29.7 142
## 2015-10-31 8.86 118 72.6 146 16.7 72.1 88.7 62.6 113 28.5 122 12.0 33.3 137
## 2015-11-30 9.32 118 71.0 144 17.3 71.8 90.2 66.9 113 29.5 133 12.3 34.5 138
## 2015-12-31 9.83 105 68.9 143 16.8 67.1 88.8 66.2 105 30.9 132 11.7 34.1 136
## JNJ JPM KO MCD MMM MRK MSFT NKE PFE PG TRV UTX VZ
## 2015-07-31 98.0 67.1 40.1 97.5 148 57.5 45.8 57.2 35.1 75.4 104.3 98.3 45.2
## 2015-08-31 92.6 62.8 38.4 93.6 140 52.5 42.9 55.5 31.6 69.4 97.9 90.3 44.4
## 2015-09-30 92.0 59.7 39.5 97.0 140 48.5 43.7 61.2 30.8 70.7 98.4 87.8 42.0
## 2015-10-31 99.6 63.4 41.7 110.5 155 53.7 51.9 65.2 33.2 75.7 111.7 97.1 45.8
## 2015-11-30 100.5 65.7 42.3 113.3 155 52.1 54.0 65.8 32.4 74.2 113.3 95.3 44.4
## 2015-12-31 102.0 65.1 42.6 117.3 150 52.4 55.1 62.3 32.0 78.7 112.2 95.4 45.2
## WMT XOM T
## 2015-07-31 70.4 77.1 33.4
## 2015-08-31 63.7 73.9 31.9
## 2015-09-30 63.8 73.0 31.3
## 2015-10-31 56.3 81.3 32.6
## 2015-11-30 57.9 80.9 32.8
## 2015-12-31 60.8 77.2 33.5
# Convert the price panel to simple (discrete) returns.
# Create the variable returns using Return.calculate()
returns <- Return.calculate(prices)
# Print the first six rows of returns. Note that the first observation is NA, because there is no prior price.
head(returns)
## AA AAPL AXP BA BAC CAT CVX DD
## 1990-12-31 NA NA NA NA NA NA NA NA
## 1991-01-31 0.12818 0.2907 0.1030 0.0882 0.230 0.06548 -0.0172 -0.0136
## 1991-02-28 -0.00388 0.0337 0.0549 -0.0181 0.040 0.09799 0.0661 0.0498
## 1991-03-31 0.02144 0.1878 0.2031 -0.0259 0.198 -0.12357 0.0233 -0.0133
## 1991-04-30 0.03743 -0.1912 -0.1272 -0.0266 0.065 0.00118 0.0158 0.1212
## 1991-05-31 0.05370 -0.1433 0.0250 0.0768 0.142 0.08661 -0.0452 0.1464
## DIS GE HD HPQ INTC IBM JNJ JPM KO
## 1990-12-31 NA NA NA NA NA NA NA NA NA
## 1991-01-31 0.0681 0.1155 0.1197 0.2196 0.1883 0.1217 0.0662 0.2209 0.0484
## 1991-02-28 0.1397 0.0704 0.1272 0.1994 0.0437 0.0256 0.0781 0.2571 0.0744
## 1991-03-31 -0.0355 0.0239 0.1083 0.0751 -0.0209 -0.1155 0.1674 0.0681 0.0406
## 1991-04-30 -0.0311 0.0162 0.0648 0.0225 0.0535 -0.0955 -0.0143 0.1799 -0.0276
## 1991-05-31 0.0076 0.0992 0.1457 0.0611 0.1320 0.0425 -0.0368 0.0427 0.0853
## MCD MMM MRK MSFT NKE PFE PG TRV
## 1990-12-31 NA NA NA NA NA NA NA NA
## 1991-01-31 -0.0215 -0.01020 0.0181 0.3040 0.18634 0.1130 -0.07940 0.0159
## 1991-02-28 0.1096 0.05170 0.1161 0.0573 0.00523 0.1759 0.02524 0.0392
## 1991-03-31 0.1017 0.00000 0.0387 0.0229 -0.03873 0.0190 0.04769 0.0610
## 1991-04-30 -0.0360 0.00706 0.0213 -0.0671 0.02981 0.0187 -0.00902 0.0126
## 1991-05-31 0.0476 0.07828 0.1032 0.1086 -0.16316 0.0865 0.01490 -0.0816
## UTX VZ WMT XOM T
## 1990-12-31 NA NA NA NA NA
## 1991-01-31 -0.00783 -0.0852 0.0909 -0.00242 -0.0551
## 1991-02-28 0.05728 0.0103 0.0720 0.08194 0.0287
## 1991-03-31 -0.02513 0.0485 0.0967 0.06122 0.0442
## 1991-04-30 -0.05155 -0.0147 0.0452 0.01709 -0.0368
## 1991-05-31 0.02958 -0.0575 0.0586 -0.00974 -0.0304
# Build a 50/50 AAPL/MSFT portfolio two ways — buy-and-hold vs monthly
# rebalancing — and compare returns and end-of-period weights.
# Remove the first row of returns (the NA row) and keep only AAPL and MSFT
returns <- returns[-1, c("AAPL", "MSFT")]
# Create the weights
eq_weights <- c(0.5, 0.5)
# Create a portfolio using buy and hold (weights drift after inception)
pf_bh <- Return.portfolio(R = returns, weights = eq_weights)
# Create a portfolio rebalancing monthly (weights reset to 50/50 each month)
pf_rebal <- Return.portfolio(R = returns, weights = eq_weights, rebalance_on="months")
# Plot the time-series, stacked in a 2x1 layout
par(mfrow = c(2, 1), mar = c(2, 4, 2, 2))
plot.zoo(pf_bh)
plot.zoo(pf_rebal)
# Create the weights
eq_weights <- c(0.5, 0.5)
# Create a portfolio using buy and hold; verbose=TRUE returns a list with
# extra components such as the end-of-period weights used below
pf_bh <- Return.portfolio(returns, weights = eq_weights, verbose=TRUE)
# Create a portfolio that rebalances monthly
pf_rebal <- Return.portfolio(returns, weights = eq_weights, rebalance_on = "months", verbose=TRUE)
# Create eop_weight_bh (end-of-period weights)
eop_weight_bh <- pf_bh$EOP.Weight
# Create eop_weight_rebal
eop_weight_rebal <- pf_rebal$EOP.Weight
# Plot end of period weights for AAPL under each policy
par(mfrow = c(2, 1), mar=c(2, 4, 2, 2))
plot.zoo(eop_weight_bh$AAPL)
plot.zoo(eop_weight_rebal$AAPL)
Chapter 2 - Analyzing Performance
Dimensions of Portfolio Performance:
Annualized Sharpe Ratio:
Time-Variation in Portfolio Performance:
Non-Normality of Return Distributions:
Example code includes:
adjClose <- c(211.3, 211.8, 226.9, 238.9, 235.5, 247.4, 250.8, 236.1, 252.9, 231.3, 244, 249.2, 242.2, 274.1, 284.2, 291.7, 288.4, 290.1, 304, 318.7, 329.8, 321.8, 251.8, 230.3, 247.1, 257.1, 267.8, 258.9, 261.3, 262.2, 273.5, 272, 261.5, 271.9, 279, 273.7, 277.7, 297.5, 288.9, 294.9, 309.6, 320.5, 318, 346.1, 351.5, 349.1, 340.4, 346, 353.4, 329.1, 331.9, 339.9, 330.8, 361.2, 358, 356.1, 322.6, 306, 304, 322.2, 330.2, 343.9, 367.1, 375.2, 375.3, 389.8, 371.2, 387.8, 395.4, 387.9, 392.5, 375.2, 417.1, 408.8, 412.7, 403.7, 415, 415.4, 408.1, 424.2, 414, 417.8, 418.7, 431.4, 435.7, 438.8, 443.4, 451.7, 440.2, 450.2, 450.5, 448.1, 463.6, 458.9, 467.8, 461.8, 466.5, 481.6, 467.1, 445.8, 450.9, 456.5, 444.3, 458.3, 475.5, 462.7, 472.4, 453.7, 459.3, 470.4, 487.4, 500.7, 514.7, 533.4, 544.8, 562.1, 561.9, 584.4, 581.5, 605.4, 615.9, 636, 640.4, 645.5, 654.2, 669.1, 670.6, 640, 652, 687.3, 705.3, 757, 740.7, 786.2, 790.8, 757.1, 801.3, 848.3, 885.1, 954.3, 899.5, 947.3, 914.6, 955.4, 970.4, 980.3, 1049.3, 1101.8, 1111.8, 1090.8, 1133.8, 1120.7, 957.3, 1017, 1098.7, 1163.6, 1229.2, 1279.6, 1238.3, 1286.4, 1335.2, 1301.8, 1372.7, 1328.7, 1320.4, 1282.7, 1362.9, 1388.9, 1469.2, 1394.5, 1366.4, 1498.6, 1452.4, 1420.6, 1454.6, 1430.8, 1517.7, 1436.5, 1429.4, 1314.9, 1320.3, 1366, 1239.9, 1160.3, 1249.5, 1255.8, 1224.4, 1211.2, 1133.6, 1040.9, 1059.8, 1139.4, 1148.1, 1130.2, 1106.7, 1147.4, 1076.9, 1067.1, 989.8, 911.6, 916.1, 815.3, 885.8, 936.3, 879.8, 855.7, 841.2, 848.2, 916.9, 963.6, 974.5, 990.3, 1008, 996, 1050.7, 1058.2, 1111.9, 1131.1, 1144.9, 1126.2, 1107.3, 1120.7, 1140.8, 1101.7, 1104.2, 1114.6, 1130.2, 1173.8, 1211.9, 1181.3, 1203.6, 1180.6, 1156.8, 1191.5, 1191.3, 1234.2, 1220.3, 1228.8, 1207, 1249.5, 1248.3, 1280.1, 1280.7, 1294.9, 1310.6, 1270.1, 1270.2, 1276.7, 1303.8, 1335.8, 1377.9, 1400.6, 1418.3, 1438.2, 1406.8, 1420.9, 1482.4, 1530.6, 1503.3, 1455.3, 1474, 1526.8, 1549.4, 1481.1, 1468.4, 1378.6, 1330.6, 1322.7, 1385.6, 1400.4, 1280, 1267.4, 1282.8, 1166.4, 
968.8, 896.2, 903.2, 825.9, 735.1, 797.9, 872.8, 919.1, 919.3, 987.5, 1020.6, 1057.1, 1036.2, 1095.6, 1115.1, 1073.9, 1104.5, 1169.4, 1186.7, 1089.4, 1030.7, 1101.6, 1049.3, 1141.2, 1183.3, 1180.6, 1257.6, 1286.1, 1327.2, 1325.8, 1363.6, 1345.2, 1320.6, 1292.3, 1218.9, 1131.4, 1253.3, 1247, 1257.6, 1312.4, 1365.7, 1408.5, 1397.9, 1310.3, 1362.2, 1379.3, 1406.6, 1440.7, 1412.2, 1416.2, 1426.2, 1498.1, 1514.7, 1569.2, 1597.6, 1630.7, 1606.3, 1685.7, 1633, 1681.6, 1756.5, 1805.8, 1848.4, 1782.6, 1859.4, 1872.3, 1883.9, 1923.6, 1960.2, 1930.7, 2003.4, 1972.3, 2018.1, 2067.6, 2058.9, 1995, 2104.5, 2067.9, 2085.5, 2107.4, 2063.1, 2103.8, 1972.2, 1920, 2079.4, 2080.4, 2043.9, 1940.2, 1932.2, 2059.7, 2065.3, 2096.9, 2098.9, 2173.6, 2170.9)
monDate <- c(5843, 5845, 5877, 5905, 5934, 5964, 5996, 6025, 6056, 6088, 6117, 6150, 6178, 6210, 6241, 6269, 6299, 6329, 6360, 6390, 6423, 6452, 6482, 6514, 6543, 6577, 6605, 6634, 6668, 6696, 6726, 6756, 6787, 6818, 6850, 6879, 6909, 6942, 6971, 6999, 7032, 7060, 7091, 7123, 7152, 7183, 7214, 7244, 7274, 7306, 7336, 7364, 7396, 7425, 7456, 7487, 7517, 7551, 7578, 7609, 7641, 7671, 7701, 7729, 7760, 7790, 7823, 7851, 7882, 7915, 7943, 7974, 8005, 8036, 8068, 8096, 8126, 8156, 8187, 8217, 8250, 8279, 8309, 8341, 8370, 8404, 8432, 8460, 8491, 8523, 8552, 8582, 8614, 8644, 8674, 8705, 8735, 8768, 8797, 8825, 8859, 8887, 8917, 8947, 8978, 9009, 9041, 9070, 9100, 9133, 9162, 9190, 9223, 9251, 9282, 9314, 9343, 9374, 9405, 9435, 9465, 9497, 9527, 9556, 9587, 9617, 9650, 9678, 9709, 9742, 9770, 9801, 9832, 9863, 9895, 9923, 9952, 9982, 10014, 10043, 10074, 10106, 10135, 10168, 10196, 10228, 10259, 10287, 10317, 10347, 10378, 10408, 10441, 10470, 10500, 10532, 10561, 10595, 10623, 10651, 10682, 10714, 10743, 10773, 10805, 10835, 10865, 10896, 10926, 10959, 10988, 11017, 11050, 11078, 11109, 11141, 11170, 11201, 11232, 11262, 11292, 11324, 11354, 11382, 11414, 11443, 11474, 11505, 11535, 11569, 11596, 11627, 11659, 11689, 11719, 11747, 11778, 11808, 11841, 11869, 11900, 11933, 11961, 11992, 12023, 12054, 12086, 12114, 12143, 12173, 12205, 12234, 12265, 12297, 12326, 12359, 12387, 12419, 12450, 12478, 12509, 12541, 12570, 12600, 12632, 12662, 12692, 12723, 12753, 12786, 12815, 12843, 12874, 12905, 12935, 12965, 12996, 13027, 13059, 13088, 13118, 13151, 13180, 13208, 13241, 13269, 13300, 13332, 13361, 13392, 13423, 13453, 13483, 13516, 13545, 13573, 13605, 13634, 13665, 13696, 13726, 13760, 13787, 13818, 13850, 13880, 13910, 13941, 13970, 14000, 14032, 14061, 14092, 14124, 14153, 14186, 14214, 14246, 14277, 14305, 14335, 14365, 14396, 14426, 14459, 14488, 14518, 14550, 14579, 14613, 14641, 14669, 14700, 14732, 14761, 14791, 14823, 14853, 14883, 14914, 14944, 14977, 15006, 
15034, 15065, 15096, 15126, 15156, 15187, 15218, 15250, 15279, 15309, 15342, 15371, 15400, 15432, 15461, 15492, 15523, 15553, 15587, 15614, 15645, 15677, 15707, 15737, 15765, 15796, 15826, 15859, 15887, 15918, 15951, 15979, 16010, 16041, 16072, 16104, 16132, 16161, 16191, 16223, 16252, 16283, 16315, 16344, 16377, 16405, 16437, 16468, 16496, 16526, 16556, 16587, 16617, 16650, 16679, 16709, 16741, 16770, 16804, 16832, 16861, 16892, 16923, 16953, 16983, 17014)
# Build the S&P 500 xts series from the adjClose/monDate vectors above.
# monDate holds integer day counts since 1970-01-01; zoo's as.Date.numeric
# method supplies that origin by default (base as.Date() alone would not —
# NOTE(review): confirm on the target R version)
sp500 <- xts(adjClose, order.by=as.Date(monDate))
names(sp500) <- "AdjClose"
str(sp500)
## An 'xts' object on 1985-12-31/2016-08-01 containing:
## Data: num [1:369, 1] 211 212 227 239 236 ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr "AdjClose"
## Indexed by objects of class: [Date] TZ: UTC
## xts Attributes:
## NULL
# Reset the plotting layout to a single panel
par(mfcol=c(1, 1))
par(mfrow=c(1, 1))
# Convert the daily frequency of sp500 to monthly frequency: sp500_monthly
# (the underlying series is already one observation per month, so the OHLC
# columns coincide — visible in the printed output below)
sp500_monthly <- to.monthly(sp500)
# Print the first six rows of sp500_monthly
head(sp500_monthly)
## sp500.Open sp500.High sp500.Low sp500.Close
## Dec 1985 211 211 211 211
## Jan 1986 212 212 212 212
## Feb 1986 227 227 227 227
## Mar 1986 239 239 239 239
## Apr 1986 236 236 236 236
## May 1986 247 247 247 247
# Monthly returns from the close column, plotted and tabulated by
# year x month (last column of the table is the annual return).
# Create sp500_returns using Return.calculate using the closing prices
sp500_returns <- Return.calculate(sp500_monthly[, 4])
# Time series plot
plot.zoo(sp500_returns)
# Produce the year x month table
table.CalendarReturns(sp500_returns)
## Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
## 1985 NA NA NA NA NA NA NA NA NA NA NA 0.2
## 1986 7.1 5.3 -1.4 5.1 1.4 -5.9 7.1 -8.5 5.5 2.1 -2.8 13.2
## 1987 3.7 2.6 -1.1 0.6 4.8 4.8 3.5 -2.4 -21.8 -8.5 7.3 4.0
## 1988 4.2 -3.3 0.9 0.3 4.3 -0.5 -3.9 4.0 2.6 -1.9 1.5 7.1
## 1989 -2.9 2.1 5.0 3.5 -0.8 8.8 1.6 -0.7 -2.5 1.6 2.1 -6.9
## 1990 0.9 2.4 -2.7 9.2 -0.9 -0.5 -9.4 -5.1 -0.7 6.0 2.5 4.1
## 1991 6.7 2.2 0.0 3.9 -4.8 4.5 2.0 -1.9 1.2 -4.4 11.2 -2.0
## 1992 1.0 -2.2 2.8 0.1 -1.8 3.9 -2.4 0.9 0.2 3.0 1.0 0.7
## 1993 1.0 1.9 -2.5 2.3 0.1 -0.5 3.5 -1.0 1.9 -1.3 1.0 3.2
## 1994 -3.0 -4.6 1.1 1.2 -2.7 3.2 3.8 -2.7 2.1 -4.0 1.2 2.4
## 1995 3.6 2.7 2.8 3.6 2.1 3.2 0.0 4.0 -0.5 4.1 1.7 3.3
## 1996 0.7 0.8 1.3 2.3 0.2 -4.6 1.9 5.4 2.6 7.3 -2.2 6.1
## 1997 0.6 -4.3 5.8 5.9 4.3 7.8 -5.7 5.3 -3.5 4.5 1.6 1.0
## 1998 7.0 5.0 0.9 -1.9 3.9 -1.2 -14.6 6.2 8.0 5.9 5.6 4.1
## 1999 -3.2 3.9 3.8 -2.5 5.4 -3.2 -0.6 -2.9 6.3 1.9 5.8 -5.1
## 2000 -2.0 9.7 -3.1 -2.2 2.4 -1.6 6.1 -5.4 -0.5 -8.0 0.4 3.5
## 2001 -9.2 -6.4 7.7 0.5 -2.5 -1.1 -6.4 -8.2 1.8 7.5 0.8 -1.6
## 2002 -2.1 3.7 -6.1 -0.9 -7.2 -7.9 0.5 -11.0 8.6 5.7 -6.0 -2.7
## 2003 -1.7 0.8 8.1 5.1 1.1 1.6 1.8 -1.2 5.5 0.7 5.1 1.7
## 2004 1.2 -1.6 -1.7 1.2 1.8 -3.4 0.2 0.9 1.4 3.9 3.2 -2.5
## 2005 1.9 -1.9 -2.0 3.0 0.0 3.6 -1.1 0.7 -1.8 3.5 -0.1 2.5
## 2006 0.0 1.1 1.2 -3.1 0.0 0.5 2.1 2.5 3.2 1.6 1.3 1.4
## 2007 -2.2 1.0 4.3 3.3 -1.8 -3.2 1.3 3.6 1.5 -4.4 -0.9 -6.1
## 2008 -3.5 -0.6 4.8 1.1 -8.6 -1.0 1.2 -9.1 -16.9 -7.5 0.8 -8.6
## 2009 -11.0 8.5 9.4 5.3 0.0 7.4 3.4 3.6 -2.0 5.7 1.8 -3.7
## 2010 2.8 5.9 1.5 -8.2 -5.4 6.9 -4.7 8.8 3.7 -0.2 6.5 2.3
## 2011 3.2 -0.1 2.9 -1.3 -1.8 -2.1 -5.7 -7.2 10.8 -0.5 0.9 4.4
## 2012 4.1 3.1 -0.8 -6.3 4.0 1.3 2.0 2.4 -2.0 0.3 0.7 5.0
## 2013 1.1 3.6 1.8 2.1 -1.5 4.9 -3.1 3.0 4.5 2.8 2.4 -3.6
## 2014 4.3 0.7 0.6 2.1 1.9 -1.5 3.8 -1.6 2.3 2.5 -0.4 -3.1
## 2015 5.5 -1.7 0.9 1.1 -2.1 2.0 -6.3 -2.6 8.3 0.0 -1.8 -5.1
## 2016 -0.4 6.6 0.3 1.5 0.1 3.6 -0.1 NA NA NA NA NA
## sp500.Close
## 1985 0.2
## 1986 29.4
## 1987 -6.2
## 1988 15.7
## 1989 10.6
## 1990 4.5
## 1991 18.9
## 1992 7.3
## 1993 9.8
## 1994 -2.3
## 1995 35.2
## 1996 23.6
## 1997 24.7
## 1998 30.5
## 1999 9.0
## 2000 -2.0
## 2001 -17.3
## 2002 -24.3
## 2003 32.2
## 2004 4.4
## 2005 8.4
## 2006 12.4
## 2007 -4.1
## 2008 -40.1
## 2009 30.0
## 2010 19.8
## 2011 2.0
## 2012 14.1
## 2013 19.0
## 2014 11.9
## 2015 -2.7
## 2016 11.9
# Drop the leading NA return (no prior price for the first observation)
sp500_returns <- sp500_returns[2:nrow(sp500_returns), ]
str(sp500_returns)
## An 'xts' object on Jan 1986/Aug 2016 containing:
## Data: num [1:368, 1] 0.00237 0.07129 0.05289 -0.01423 0.05053 ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr "sp500.Close"
## Indexed by objects of class: [yearmon] TZ: UTC
## xts Attributes:
## List of 2
## $ ret_type : chr "discrete"
## $ coredata_content: chr "discreteReturn"
# Basic return moments: arithmetic mean, geometric mean, and volatility.
# Compute the mean monthly returns
mean(sp500_returns)
## [1] 0.00732
# Compute the geometric mean of monthly returns (always <= the arithmetic
# mean; the gap grows with volatility)
mean.geometric(sp500_returns)
## sp500.Close
## Geometric Mean 0.00635
# Compute the standard deviation
sd(sp500_returns)
## [1] 0.0436
rfRates <- c(0.0053, 0.006, 0.0052, 0.0049, 0.0052, 0.0052, 0.0046, 0.0045, 0.0046, 0.0039, 0.0049, 0.0042, 0.0043, 0.0047, 0.0044, 0.0038, 0.0048, 0.0046, 0.0047, 0.0045, 0.006, 0.0035, 0.0039, 0.0029, 0.0046, 0.0044, 0.0046, 0.0051, 0.0049, 0.0051, 0.0059, 0.0062, 0.0061, 0.0057, 0.0063, 0.0055, 0.0061, 0.0067, 0.0067, 0.0079, 0.0071, 0.007, 0.0074, 0.0065, 0.0068, 0.0069, 0.0061, 0.0057, 0.0057, 0.0064, 0.0069, 0.0068, 0.0063, 0.0068, 0.0066, 0.006, 0.0068, 0.0057, 0.006, 0.0052, 0.0048, 0.0044, 0.0053, 0.0047, 0.0042, 0.0049, 0.0046, 0.0046, 0.0042, 0.0039, 0.0038, 0.0034, 0.0028, 0.0034, 0.0032, 0.0028, 0.0032, 0.0031, 0.0026, 0.0026, 0.0023, 0.0023, 0.0028, 0.0023, 0.0022, 0.0025, 0.0024, 0.0022, 0.0025, 0.0024, 0.0025, 0.0026, 0.0022, 0.0025, 0.0023, 0.0025, 0.0021, 0.0027, 0.0027, 0.0031, 0.0031, 0.0028, 0.0037, 0.0037, 0.0038, 0.0037, 0.0044, 0.0042, 0.004, 0.0046, 0.0044, 0.0054, 0.0047, 0.0045, 0.0047, 0.0043, 0.0047, 0.0042, 0.0049, 0.0043, 0.0039, 0.0039, 0.0046, 0.0042, 0.004, 0.0045, 0.0041, 0.0044, 0.0042, 0.0041, 0.0046, 0.0045, 0.0039, 0.0043, 0.0043, 0.0049, 0.0037, 0.0043, 0.0041, 0.0044, 0.0042, 0.0039, 0.0048, 0.0043, 0.0039, 0.0039, 0.0043, 0.004, 0.0041, 0.004, 0.0043, 0.0046, 0.0032, 0.0031, 0.0038, 0.0035, 0.0035, 0.0043, 0.0037, 0.0034, 0.004, 0.0038, 0.0039, 0.0039, 0.0039, 0.0036, 0.0044, 0.0041, 0.0043, 0.0047, 0.0046, 0.005, 0.004, 0.0048, 0.005, 0.0051, 0.0056, 0.0051, 0.005, 0.0054, 0.0038, 0.0042, 0.0039, 0.0032, 0.0028, 0.003, 0.0031, 0.0028, 0.0022, 0.0017, 0.0015, 0.0014, 0.0013, 0.0013, 0.0015, 0.0014, 0.0013, 0.0015, 0.0014, 0.0014, 0.0014, 0.0012, 0.0011, 0.001, 9e-04, 0.001, 0.001, 9e-04, 0.001, 7e-04, 7e-04, 8e-04, 7e-04, 7e-04, 8e-04, 7e-04, 6e-04, 9e-04, 8e-04, 6e-04, 8e-04, 0.001, 0.0011, 0.0011, 0.0011, 0.0015, 0.0016, 0.0016, 0.0016, 0.0021, 0.0021, 0.0024, 0.0023, 0.0024, 0.003, 0.0029, 0.0027, 0.0031, 0.0032, 0.0035, 0.0034, 0.0037, 0.0036, 0.0043, 0.004, 0.004, 0.0042, 0.0041, 0.0041, 0.0042, 0.004, 0.0044, 0.0038, 
0.0043, 0.0044, 0.0041, 0.004, 0.004, 0.0042, 0.0032, 0.0032, 0.0034, 0.0027, 0.0021, 0.0013, 0.0017, 0.0018, 0.0018, 0.0017, 0.0015, 0.0013, 0.0015, 8e-04, 3e-04, 0, 0, 1e-04, 2e-04, 1e-04, 0, 1e-04, 1e-04, 1e-04, 1e-04, 0, 0, 1e-04, 0, 0, 1e-04, 1e-04, 1e-04, 1e-04, 1e-04, 1e-04, 1e-04, 1e-04, 1e-04, 1e-04, 1e-04, 1e-04, 1e-04, 0, 0, 0, 0, 1e-04, 0, 0, 0, 0, 0, 0, 0, 0, 1e-04, 0, 0, 1e-04, 1e-04, 1e-04, 1e-04, 1e-04, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1e-04, 1e-04, 2e-04, 2e-04, 1e-04, 1e-04, 2e-04, 2e-04, 2e-04, 2e-04)
# Monthly risk-free rate series, aligned to the sp500_returns index so the
# two can be subtracted elementwise below
rf <- xts(rfRates, order.by=index(sp500_returns))
str(rf)
## An 'xts' object on Jan 1986/Aug 2016 containing:
## Data: num [1:368, 1] 0.0053 0.006 0.0052 0.0049 0.0052 0.0052 0.0046 0.0045 0.0046 0.0039 ...
## Indexed by objects of class: [yearmon] TZ: UTC
## xts Attributes:
## NULL
# Excess returns, Sharpe ratio, annualized statistics, and 12-month rolling
# performance charts for the S&P 500 series.
# Compute the annualized risk free rate (compound the monthly rate)
annualized_rf <- (1 + rf)^12 - 1
# Plot the annualized risk free rate
plot.zoo(annualized_rf)
# Compute the series of excess portfolio returns (return minus risk-free)
sp500_excess <- sp500_returns - rf
# Compare the mean
mean(sp500_returns)
## [1] 0.00732
mean(sp500_excess)
## [1] 0.00458
# Compute the Sharpe ratio (monthly: mean excess return per unit of risk)
sp500_sharpe <- mean(sp500_excess) / sd(sp500_returns)
# Compute the annualized mean
Return.annualized(sp500_returns, scale=12)
## sp500.Close
## Annualized Return 0.0789
# Compute the annualized standard deviation
StdDev.annualized(sp500_returns)
## sp500.Close
## Annualized Standard Deviation 0.151
# Compute the annualized Sharpe ratio: ann_sharpe
ann_sharpe <- Return.annualized(sp500_returns, scale=12) / StdDev.annualized(sp500_returns)
# Compute all of the above at once using table.AnnualizedReturns()
table.AnnualizedReturns(sp500_returns)
## sp500.Close
## Annualized Return 0.0789
## Annualized Std Dev 0.1512
## Annualized Sharpe (Rf=0%) 0.5221
# In this exercise you will also use the function SharpeRatio.annualized() to calculate the annualized Sharpe Ratio
# This function takes the arguments R, and Rf
# The R argument takes an xts, vector, matrix, data.frame, timeSeries or zoo object of asset returns
# The Rf argument is necessary in SharpeRatio.annualized(), as it takes into account the risk-free rate in the same period of your returns
# For this example you can use the object rf to fulfill the Rf argument
# The function chart.RollingPerformance() makes it easy to visualize the rolling estimates of performance in R
# Familiarize yourself first with the syntax of this function
# It requires you to specify the time series of portfolio returns (by setting the argument R), the length of the window (width) and the function used to compute the performance (argument FUN)
# To see all three plots together, PerformanceAnalytics provides a shortcut function charts.RollingPerformance()
# Note the charts instead of chart. This function creates all of the previous charts at once and does not use the argument FUN
# Calculate the mean, volatility, and sharpe ratio of sp500_returns
returns_ann <- Return.annualized(sp500_returns, scale=12)
sd_ann <- StdDev.annualized(sp500_returns)
sharpe_ann <- SharpeRatio.annualized(sp500_returns, Rf=rf)
# Plotting the 12-month rolling annualized mean, with the full-sample value
# as a horizontal reference line
chart.RollingPerformance(R = sp500_returns, width = 12, scale=12, FUN = "Return.annualized")
abline(h = returns_ann)
# Plotting the 12-month rolling annualized standard deviation
chart.RollingPerformance(R = sp500_returns, width=12, FUN="StdDev.annualized")
abline(h = sd_ann)
# Plotting the 12-month rolling annualized Sharpe ratio
chart.RollingPerformance(R = sp500_returns, width=12, FUN="SharpeRatio.annualized", Rf=rf)
abline(h = sharpe_ann)
# Compare the return distribution of a crisis year (2008) with a calm year
# (2014) via histograms overlaid with density and normal curves.
# Create window for 2008
sp500_2008 <- window(sp500_returns, start = "2008-01-01", end = "2008-12-31")
# Create window for 2014
sp500_2014 <- window(sp500_returns, start = "2014-01-01", end = "2014-12-31")
# Plotting settings: side-by-side panels
par(mfrow = c(1, 2) , mar=c(3, 2, 2, 2))
names(sp500_2008) <- "sp500_2008"
names(sp500_2014) <- "sp500_2014"
# Plot histogram of 2008
chart.Histogram(sp500_2008, methods = c("add.density", "add.normal"))
# Plot histogram of 2014
chart.Histogram(sp500_2014, methods = c("add.density", "add.normal"))
# Non-normality and downside risk measures: skewness, excess kurtosis,
# semi-deviation, value-at-risk, and expected shortfall.
# Compute the skewness (negative: left tail of losses is longer)
# skewness(sp500_daily)
skewness(sp500_returns)
## [1] -0.791
# Compute the excess kurtosis (positive: fatter tails than a normal)
# kurtosis(sp500_daily)
kurtosis(sp500_returns)
## [1] 2.43
# Calculate the SemiDeviation (volatility of below-mean returns only)
SemiDeviation(sp500_returns)
## sp500.Close
## Semi-Deviation 0.0333
# Calculate the value at risk at the 2.5% and 5% levels
VaR(sp500_returns, p=0.025)
## sp500.Close
## VaR -0.0977
VaR(sp500_returns, p=0.05)
## sp500.Close
## VaR -0.0715
# Calculate the expected shortfall (mean loss beyond the VaR threshold)
ES(sp500_returns, p=0.025)
## sp500.Close
## ES -0.165
ES(sp500_returns, p=0.05)
## sp500.Close
## ES -0.118
# Drawdown analysis. The index is first converted from yearmon to Date so
# the drawdown table/chart report calendar dates.
par(mfcol=c(1, 1))
par(mfrow=c(1, 1))
index(sp500_returns) <- as.Date(index(sp500_returns))
# Table of drawdowns (worst peak-to-trough episodes and recovery lengths)
table.Drawdowns(sp500_returns)
## From Trough To Depth Length To Trough Recovery
## 1 2007-11-01 2009-02-01 2013-03-01 -0.526 65 16 49
## 2 2000-09-01 2002-09-01 2007-05-01 -0.463 81 25 56
## 3 1987-09-01 1987-11-01 1989-07-01 -0.302 23 3 20
## 4 1990-06-01 1990-10-01 1991-02-01 -0.158 9 5 4
## 5 1998-07-01 1998-08-01 1998-11-01 -0.156 5 2 3
# Plot of drawdowns
chart.Drawdown(sp500_returns)
par(mfcol=c(1, 1))
par(mfrow=c(1, 1))
Chapter 3 - Performance Drivers
Drivers in the Case of Two Assets:
Using Matrix Notation:
Portfolio Risk Budget:
Example code includes:
# load("./RInputFiles/eq_prices.RData")
# load("./RInputFiles/bond_prices.RData")
retEQ <- c(0.02182, 0.027, 0.03152, 0.01989, 0.01337, 0.01504, -0.01962, 0.01159, 0.0443, 0.03392, -0.01462, -0.03131, 0.01283, 0.03871, 0.01357, -0.03873, -0.01126, -0.06046, -0.02584, -0.00894, 0.04766, 0.01512, -0.08358, -0.00899, 0.01545, -0.09417, -0.16519, -0.06961, 0.0098, -0.08211, -0.10745, 0.08331, 0.09935, 0.05845, -0.00065, 0.07461, 0.03694, 0.03546, -0.01923, 0.06161, 0.0191, -0.03634, 0.03119, 0.06088, 0.01547, -0.07945, -0.05174, 0.0683, -0.04498, 0.08955, 0.0382, 0, 0.06685, 0.0233, 0.03474, 0.00012, 0.02896, -0.01121, -0.01687, -0.02, -0.05498, -0.06942, 0.10915, -0.00406, 0.01045, 0.04637, 0.04341, 0.03216, -0.00668, -0.06006, 0.04058, 0.01183, 0.02505, 0.02535, -0.0182, 0.00566, 0.00893, 0.05119, 0.01276, 0.03797, 0.01921, 0.02361, -0.01334, 0.05168, -0.02999, 0.03165, 0.04631, 0.02964, 0.02593, -0.03525, 0.04552, 0.0083, 0.00695, 0.02321, 0.02064, -0.01344, 0.03946, -0.0138, 0.02355, 0.02747, -0.00254, -0.02963, 0.0562, -0.01571, 0.00983, 0.01286, -0.02031, 0.02259, -0.06095, -0.02552, 0.08506, 0.00366, -0.01728, -0.04979, -0.00083, 0.06727, 0.00394, 0.01701, 0.00348, 0.03647, -0.00083)
retBonds <- c(0.01591, 0.01012, 0.00686, 0.01062, -0.006, -7e-04, 0.01625, -0.00195, 0.0059, -0.00917, -0.00377, 0.01073, 0.01279, 0.00633, 0.01008, 0.01804, -9e-05, 0.02313, -0.0016, 0.0013, 0.00328, -0.01265, -0.00185, 0.00394, 0.00742, -0.0176, -0.02281, 0.03024, 0.06662, -0.01987, -0.01055, 0.01103, 0.00515, 0.0071, 0.00421, 0.01269, 0.0122, 0.01188, 0.00222, 0.01286, -0.01881, 0.01415, 0.00198, -7e-05, 0.00973, 0.01082, 0.01769, 0.00855, 0.01287, 7e-05, 0.00152, -0.00835, -0.00675, -0.00085, 0.0029, -0.00223, 0.01567, 0.0124, -0.00445, 0.01687, 0.0152, 0.00771, 0.00126, -0.00331, 0.01357, 0.00726, -0.00014, -0.00573, 0.00906, 0.01078, -2e-04, 0.01362, 0, 0.00267, -0.00047, 0.00271, -0.00245, -0.00621, 0.00591, 0.00099, 0.00969, -0.02001, -0.01565, 0.00269, -0.00826, 0.01121, 0.00833, -0.00251, -0.00557, 0.01541, 0.00376, -0.00148, 0.00822, 0.01179, -0.00058, -0.00251, 0.0115, -0.00615, 0.01064, 0.00657, 0.00148, 0.02052, -0.00895, 0.00375, -0.00323, -0.00437, -0.01077, 0.00862, -0.00336, 0.00811, 0.00069, -0.00389, -0.00192, 0.01241, 0.00887, 0.00875, 0.00255, 0.00014, 0.01935, 0.00544, -0.00233)
idxDates <- as.Date(c(13390, 13420, 13451, 13481, 13512, 13543, 13571, 13602, 13632, 13663, 13693, 13724, 13755, 13785, 13816, 13846, 13877, 13908, 13937, 13968, 13998, 14029, 14059, 14090, 14121, 14151, 14182, 14212, 14243, 14274, 14302, 14333, 14363, 14394, 14424, 14455, 14486, 14516, 14547, 14577, 14608, 14639, 14667, 14698, 14728, 14759, 14789, 14820, 14851, 14881, 14912, 14942, 14973, 15004, 15032, 15063, 15093, 15124, 15154, 15185, 15216, 15246, 15277, 15307, 15338, 15369, 15398, 15429, 15459, 15490, 15520, 15551, 15582, 15612, 15643, 15673, 15704, 15735, 15763, 15794, 15824, 15855, 15885, 15916, 15947, 15977, 16008, 16038, 16069, 16100, 16128, 16159, 16189, 16220, 16250, 16281, 16312, 16342, 16373, 16403, 16434, 16465, 16493, 16524, 16554, 16585, 16615, 16646, 16677, 16707, 16738, 16768, 16799, 16830, 16859, 16890, 16920, 16951, 16981, 17012, 17043))
# Build a one-column xts time series of monthly equity returns indexed by idxDates
returns_equities <- xts(retEQ, order.by=idxDates)
names(returns_equities) <- "equities"
str(returns_equities)
## An 'xts' object on 2006-08-30/2016-08-30 containing:
## Data: num [1:121, 1] 0.0218 0.027 0.0315 0.0199 0.0134 ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr "equities"
## Indexed by objects of class: [Date] TZ: UTC
## xts Attributes:
## NULL
# Build the matching one-column xts series of bond returns on the same dates
returns_bonds <- xts(retBonds, order.by=idxDates)
names(returns_bonds) <- "bonds"
str(returns_bonds)
## An 'xts' object on 2006-08-30/2016-08-30 containing:
## Data: num [1:121, 1] 0.01591 0.01012 0.00686 0.01062 -0.006 ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr "bonds"
## Indexed by objects of class: [Date] TZ: UTC
## xts Attributes:
## NULL
# Create a grid of candidate equity weights from 0% to 100% in 1% steps
grid <- seq(from = 0, to = 1, by = 0.01)
# Pre-allocate a vector to hold the annualized Sharpe ratio for each weight
vsharpe <- rep(NA, times = length(grid))
# For each candidate weight, form the two-asset portfolio return series and
# record its annualized Sharpe ratio (PerformanceAnalytics).
# seq_along(grid) replaces 1:length(grid): it is the idiomatic, empty-safe form.
for(i in seq_along(grid)) {
  weight <- grid[i]
  # Portfolio return: 'weight' in equities, remainder in bonds
  preturns <- weight * returns_equities + (1 - weight) * returns_bonds
  vsharpe[i] <- SharpeRatio.annualized(preturns)
}
# Plot weights and Sharpe ratio
plot(grid, vsharpe, xlab = "Weights", ylab= "Ann. Sharpe ratio")
# Mark the weight with the highest Sharpe ratio; which.max() replaces the
# fragile floating-point equality test vsharpe == max(vsharpe)
abline(v = grid[which.max(vsharpe)], lty = 3)
# Create a scatter plot of bond returns vs. equity returns
chart.Scatter(returns_bonds, returns_equities)
# Find the correlation between the two return series (weak positive here)
cor(returns_equities, returns_bonds)
## bonds
## equities 0.0621
# Merge returns_equities and returns_bonds into one two-column xts object
returns <- merge(returns_equities, returns_bonds)
# Find and visualize the correlation using chart.Correlation
chart.Correlation(returns)
# Visualize the rolling estimates using chart.RollingCorrelation
# width = 24 -> correlation computed over a rolling 24-observation window
chart.RollingCorrelation(returns_equities, returns_bonds, width = 24)
retData <- c(0.0218, 0.027, 0.0315, 0.0199, 0.0134, 0.015, -0.0196, 0.0116, 0.0443, 0.0339, -0.0146, -0.0313, 0.0128, 0.0387, 0.0136, -0.0387, -0.0113, -0.0605, -0.0258, -0.0089, 0.0477, 0.0151, -0.0836, -0.009, 0.0155, -0.0942, -0.1652, -0.0696, 0.0098, -0.0821, -0.1074, 0.0833, 0.0993, 0.0585, -7e-04, 0.0746, 0.0369, 0.0355, -0.0192, 0.0616, 0.0191, -0.0363, 0.0312, 0.0609, 0.0155, -0.0795, -0.0517, 0.0683, -0.045, 0.0896, 0.0382, 0, 0.0669, 0.0233, 0.0347, 1e-04, 0.029, -0.0112, -0.0169, -0.02, -0.055, -0.0694, 0.1091, -0.0041, 0.0104, 0.0464, 0.0434, 0.0322, -0.0067, -0.0601, 0.0406, 0.0118, 0.0251, 0.0254, -0.0182, 0.0057, 0.0089, 0.0512, 0.0128, 0.038, 0.0192, 0.0236, -0.0133, 0.0517, -0.03, 0.0316, 0.0463, 0.0296, 0.0259, -0.0352, 0.0455, 0.0083, 0.007, 0.0232, 0.0206, -0.0134, 0.0395, -0.0138, 0.0236, 0.0275, -0.0025, -0.0296, 0.0562, -0.0157, 0.0098, 0.0129, -0.0203, 0.0226, -0.061, -0.0255, 0.0851, 0.0037, -0.0173, -0.0498, -8e-04, 0.0673, 0.0039, 0.017, 0.0035, 0.0365, -8e-04, 0.0159, 0.0101, 0.0069, 0.0106, -0.006, -7e-04, 0.0163, -0.002, 0.0059, -0.0092, -0.0038, 0.0107, 0.0128, 0.0063, 0.0101, 0.018, -1e-04, 0.0231, -0.0016, 0.0013, 0.0033, -0.0127, -0.0018, 0.0039, 0.0074, -0.0176, -0.0228, 0.0302, 0.0666, -0.0199, -0.0106, 0.011, 0.0052, 0.0071, 0.0042, 0.0127, 0.0122, 0.0119, 0.0022, 0.0129, -0.0188, 0.0141, 0.002, -1e-04, 0.0097, 0.0108, 0.0177, 0.0085, 0.0129, 1e-04, 0.0015, -0.0083, -0.0068, -9e-04, 0.0029, -0.0022, 0.0157, 0.0124, -0.0045, 0.0169, 0.0152, 0.0077, 0.0013, -0.0033, 0.0136, 0.0073, -1e-04, -0.0057, 0.0091, 0.0108, -2e-04, 0.0136, 0, 0.0027, -5e-04, 0.0027, -0.0025, -0.0062, 0.0059, 0.001, 0.0097, -0.02, -0.0157, 0.0027, -0.0083, 0.0112, 0.0083, -0.0025, -0.0056, 0.0154, 0.0038, -0.0015, 0.0082, 0.0118, -6e-04, -0.0025, 0.0115, -0.0061, 0.0106, 0.0066, 0.0015, 0.0205, -0.0089, 0.0037, -0.0032, -0.0044, -0.0108, 0.0086, -0.0034, 0.0081, 7e-04, -0.0039, -0.0019, 0.0124, 0.0089, 0.0087, 0.0025, 1e-04, 0.0194, 0.0054, -0.0023, 0.0348, 
0.0197, 0.0603, 0.0478, -0.0184, 0.0878, -0.027, -0.0207, -0.0013, -0.0038, -0.0901, -0.0826, 0.0674, 0.0392, 0.021, -0.0947, -0.0541, -0.0078, -0.0318, 0.0649, 0.0638, -0.0021, -0.1062, 0.0312, 0.0237, -0.0012, -0.3173, -0.2272, 0.1671, -0.1753, -0.2053, 0.0364, 0.3068, 0.0262, -0.032, 0.1077, 0.1432, 0.0659, -0.0446, 0.0657, 0.0739, -0.0552, 0.0558, 0.1019, 0.0715, -0.0533, -0.0521, 0.0959, -0.0128, 0.0447, 0.0474, -0.0185, 0.0455, 0.0325, 0.0471, -0.0161, 0.0575, 0.0137, -0.033, 0.0156, -0.0562, -0.1084, 0.1429, -0.038, 0.0485, 0.0638, -0.0115, 0.052, 0.0286, -0.0451, 0.0552, 0.02, -1e-04, -0.0186, -0.0091, -0.0026, 0.0372, 0.0374, 0.0122, 0.0287, 0.0673, -0.0598, -0.0198, 0.009, -0.0698, 0.0348, 0.0452, -0.0525, 0.001, 0.0428, 0.0507, 0.005, 0.0329, 0.024, 0.0113, 8e-04, 0.0304, -0.0604, 0.0994, 0.02, 0.019, 0.0685, -0.0367, 0.0173, -0.0585, -0.003, -0.0467, 0.0577, -0.0629, 0.0306, 0.0576, -0.0063, 0.0183, -0.0344, -0.0036, 0.1047, -0.0235, 0.0225, 0.0693, 0.0426, 0.0022, -0.0713, -0.1074, -0.0254, 0.05, -0.0692, -0.0255, 0.0371, 0.0247, 0.0046, -0.0118, 0.0313, 0.0478, -0.0319, 0.0944, 0.06, -0.0046, 0.0597, -0.0046, 0.1126, -0.0094, 0.0819, 0.0847, 0.1032, -0.1332, -0.0748, -0.1044, -0.2967, -0.1488, -0.1118, -0.1035, -0.0612, 0.0457, -0.0155, 0.215, -0.0023, 0.0073, 0.0043, -0.0237, 0.0482, 0.0129, 0.0098, -0.0811, 0.0592, 0.0061, 0.0315, -0.136, 7e-04, 0.0576, -0.0548, 0.0803, 0.029, 0.0123, 0.0909, 0.0311, 0.0265, 0.0349, 0.0426, -0.0686, -0.0596, 0.0317, -0.0171, -0.1272, 0.1027, 0.015, -0.024, 0.024, 0.0622, -0.0304, -0.0035, -0.1278, 0.0069, 0.0588, 0.0717, -0.0214, -0.0402, 0.016, -0.0052, 0.0457, -0.051, 0.0111, -0.0489, -0.0153, -3e-04, 0.0565, 0.031, -0.0349, -0.0145, -0.0047, 0.0135, -0.0221, 0.048, -6e-04, 0.01, -0.0051, 0.0238, -0.0566, -0.0156, -0.0606, -0.0595, -0.101, -0.1375, -0.0871, 0.0594, -0.0656, 0.1092, -0.0217, -0.0024, -0.1416, 0.0028, -0.06, -6e-04, -0.0914, -0.0825, -0.0527, -0.0208, 0.0447, 0.1037, 0.0131, 0.0065, -0.1005, -0.0208)
# Reshape the concatenated return data into a 4-column matrix
# (byrow=FALSE: values fill column by column, one column per asset)
retMtx <- matrix(retData, ncol=4, byrow=FALSE)
returns <- xts(retMtx, order.by=idxDates)
names(returns) <- c("equities", "bonds", "realestate", "commodities")
str(returns)
## An 'xts' object on 2006-08-30/2016-08-30 containing:
## Data: num [1:121, 1:4] 0.0218 0.027 0.0315 0.0199 0.0134 0.015 -0.0196 0.0116 0.0443 0.0339 ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr [1:4] "equities" "bonds" "realestate" "commodities"
## Indexed by objects of class: [Date] TZ: UTC
## xts Attributes:
## NULL
# Reset the graphics device to a single-panel layout
par(mfcol=c(1, 1))
par(mfrow=c(1, 1))
# Create a vector of mean monthly returns, one per asset (column)
means <- apply(returns, 2, "mean")
# Create a vector of standard deviations, one per asset
sds <- apply(returns, 2, "sd")
# Create a risk/return scatter plot with one labelled point per asset
plot(x=sds, y=means)
text(sds, means, labels = colnames(returns), cex = 0.7)
abline(h = 0, lty = 3)
# Recompute the per-asset standard deviations without names.
# BUG FIX: the original call was sapply(returns, 1, FUN = sd), which iterates
# element-wise over the xts matrix and passes the stray `1` into sd() via
# `...`, so it does NOT yield the four column volatilities. Column-wise
# apply() with MARGIN = 2 gives the intended result (consistent with the
# apply() call above and the all.equal check below).
sds <- unname(apply(returns, 2, FUN = sd))
# Create a matrix with variances on the diagonal
diag_cov <- diag(sds**2)
# Create a covariance matrix of returns
cov_matrix <- cov(returns)
# Create a correlation matrix of returns
cor_matrix <- cor(returns)
# Verify covariances equal the product of standard deviations and correlation
all.equal(cov_matrix[1,2], cor_matrix[1,2] * sds[1] * sds[2])
## [1] TRUE
# Portfolio weights: 40% equities, 40% bonds, 10% real estate, 10% commodities
weights <- c(0.4, 0.4, 0.1, 0.1)
# Mean monthly return of each asset (same order as the weights)
vmeans <- c(0.0070692, 0.0040095, 0.0089495, -0.0082961)
# Create a weight matrix w (4 x 1 column vector)
w <- as.matrix(weights)
# Create a matrix of returns (4 x 1 column vector)
mu <- as.matrix(vmeans)
# Calculate portfolio mean monthly return as the matrix product w' * mu
t(w) %*% mu
## [,1]
## [1,] 0.0045
# Calculate portfolio volatility (sigma is the covariance matrix)
# sqrt(t(w) %*% sigma %*% w)
sqrt(t(w) %*% cov_matrix %*% w)
## [,1]
## [1,] 0.0282
# Create portfolio weights
weights <- c(0.4, 0.4, 0.1, 0.1)
# Volatility budget: decompose total portfolio volatility into per-asset
# percentage contributions (PerformanceAnalytics component StdDev)
vol_budget <- StdDev(returns, portfolio_method = "component", weights = weights)
# Make a table of weights and risk contribution
weights_percrisk <- cbind(weights, vol_budget$pct_contrib_StdDev)
colnames(weights_percrisk) <- c("weights", "perc vol contrib")
# Print the table; note equities hold 40% of capital but ~59% of the risk
weights_percrisk
## weights perc vol contrib
## equities 0.4 0.5900
## bonds 0.4 0.0405
## realestate 0.1 0.2197
## commodities 0.1 0.1498
Chapter 4 - Optimizing the Portfolio
Modern Portfolio Theory of Harry Markowitz:
Efficient Frontier:
In-Sample vs. Out-of-Sample Evaluations:
Example code includes:
# Compute discrete returns from prices; drop the first row, which is NA
# because the first observation has no prior price to difference against
returns <- Return.calculate(prices)[-1, ]
str(returns)
## An 'xts' object on 1991-01-31/2015-12-31 containing:
## Data: num [1:300, 1:30] 0.12818 -0.00388 0.02144 0.03743 0.0537 ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr [1:30] "AA" "AAPL" "AXP" "BA" ...
## Indexed by objects of class: [Date] TZ: UTC
## xts Attributes:
## List of 2
## $ ret_type : chr "discrete"
## $ coredata_content: chr "discreteReturn"
# Verify the class of returns
class(returns)
## [1] "xts" "zoo"
# Investigate the dimensions of returns (300 months x 30 stocks)
dim(returns)
## [1] 300 30
# Equally-weighted portfolio return = mean across the 30 stocks each month
ew_preturns <- rowMeans(returns)
# Cast the numeric vector back to an xts object on the original dates
ew_preturns <- xts(ew_preturns, order.by = time(returns))
# Plot ew_preturns
plot.zoo(ew_preturns)
# Load tseries for portfolio.optim()
library(tseries)
##
## Attaching package: 'tseries'
## The following object is masked _by_ '.GlobalEnv':
##
## sharpe
# Create an optimized (minimum-variance, default target return) portfolio
opt <- portfolio.optim(returns)
# Extract the optimal weight vector pf_weights
pf_weights <- opt$pw
# Assign asset names
names(pf_weights) <- colnames(returns)
# Keep only assets whose optimal weight is at least 1%
opt_weights <- pf_weights[pf_weights >= 0.01]
# Barplot of opt_weights
barplot(opt_weights)
# Print expected portfolio return (pm) and volatility (ps)
opt$pm
## [1] 0.0125
opt$ps
## [1] 0.0356
# The function portfolio.optim has arguments that allow for more general
# specifications. The arguments are as follows:
# portfolio.optim(x, pm = mean(x), shorts = FALSE, reshigh = NULL)
# The argument pm sets the target return; reshigh specifies the upper
# constraints on the portfolio weights; shorts is a logical specifying
# whether negative weights (short positions) are allowed (default FALSE).
# Create portfolio with target return of average returns
pf_mean <- portfolio.optim(returns, pm = mean(returns))
# Create portfolio with target return 10% greater than average returns
pf_10plus <- portfolio.optim(returns, pm = 1.1 * mean(returns))
# Print the standard deviations of both portfolios
pf_mean$ps
## [1] 0.0356
pf_10plus$ps
## [1] 0.0384
# Calculate the proportion increase in standard deviation
# (here a 10% higher return target costs ~8% more volatility)
(pf_10plus$ps - pf_mean$ps) / (pf_mean$ps)
## [1] 0.0798
# Create vectors of maximum per-asset weights: 100%, 10%, and 5% caps
max_weights1 <- rep(1, ncol(returns))
max_weights2 <- rep(0.1, ncol(returns))
max_weights3 <- rep(0.05, ncol(returns))
# Create an optimum portfolio with max weights of 100%
opt1 <- portfolio.optim(returns, reshigh = max_weights1)
# Create an optimum portfolio with max weights of 10%
opt2 <- portfolio.optim(returns, reshigh = max_weights2)
# Create an optimum portfolio with max weights of 5%
opt3 <- portfolio.optim(returns, reshigh = max_weights3)
# Count assets with weight above 1%: tighter caps force wider diversification
sum(opt1$pw > .01)
## [1] 15
sum(opt2$pw > .01)
## [1] 17
sum(opt3$pw > .01)
## [1] 22
# Print portfolio volatilities: volatility rises slightly as caps tighten
opt1$ps
## [1] 0.0356
opt2$ps
## [1] 0.0362
opt3$ps
## [1] 0.038
# Calculate each stock's mean monthly return
stockmu <- colMeans(returns)
# Create a grid of target returns spanning 1% up to the best single stock
grid <- seq(from = 0.01, to = max(stockmu), length.out = 50)
# Pre-allocate vectors to store portfolio means and standard deviations
vpm <- vpsd <- numeric(length(grid))
# Pre-allocate the weight matrix: one row per target return, one column per
# asset. Dimensions are derived from the data instead of hard-coding 50 x 30,
# so the code survives a change of grid size or asset universe.
mweights <- matrix(NA, length(grid), ncol(returns))
# Solve a mean-variance optimization for every target return on the grid
# (seq_along() replaces the 1:length() anti-pattern)
for(i in seq_along(grid)) {
  opt <- portfolio.optim(x = returns, pm = grid[i])
  vpm[i] <- opt$pm
  vpsd[i] <- opt$ps
  mweights[i, ] <- opt$pw
}
# Label the weight columns with the asset names
dimnames(mweights)[[2]] <- names(prices)
# Minimum-variance portfolio; which.min() replaces the fragile floating-point
# equality test vpsd == min(vpsd)
weights_minvar <- mweights[which.min(vpsd), ]
# Calculate the Sharpe ratio
# (0.0075 is presumably the per-period risk-free rate -- TODO confirm)
vsr <- (vpm - 0.0075) / vpsd
# Maximum-Sharpe-ratio portfolio, again via which.max() instead of ==
weights_max_sr <- mweights[which.max(vsr), ]
# Barplots of both portfolios, showing only weights above 1%
par(mfrow = c(2, 1), mar = c(3, 2, 2, 1))
barplot(weights_minvar[weights_minvar > 0.01])
barplot(weights_max_sr[weights_max_sr > 0.01])
par(mfrow = c(1, 1))
# Create returns_estim: the estimation (in-sample) window
returns_estim <- window(returns, start = "1991-01-01", end = "2003-12-31")
# Create returns_eval: the evaluation (out-of-sample) window
returns_eval <- window(returns, start = "2004-01-01", end = "2015-12-31")
# Cap every asset at 10% of the portfolio
max_weights <- rep(0.1, ncol(returns))
# Create portfolio with estimation sample
pf_estim <- portfolio.optim(returns_estim, reshigh = max_weights)
# Create portfolio with evaluation sample
pf_eval <- portfolio.optim(returns_eval, reshigh = max_weights)
# Scatter plot comparing in-sample vs. out-of-sample optimal weights
plot(pf_estim$pw, pf_eval$pw)
# BUG FIX: the original abline(h = 0, b = 1) drew a horizontal line at zero
# (the slope b is ignored when h is given). The intended 45-degree identity
# line -- points on it have identical weights in both samples -- needs
# intercept 'a' and slope 'b'.
abline(a = 0, b = 1, lty = 3)
# In-sample portfolio returns: estimation weights on the estimation window,
# rebalanced back to those weights every month
returns_pf_estim <- Return.portfolio(returns_estim, pf_estim$pw, rebalance_on = "months")
# Out-of-sample: the SAME estimation weights applied to the evaluation window
returns_pf_eval <- Return.portfolio(returns_eval, pf_estim$pw, rebalance_on = "months")
# Print a table for your estimation portfolio
table.AnnualizedReturns(returns_pf_estim)
## portfolio.returns
## Annualized Return 0.194
## Annualized Std Dev 0.133
## Annualized Sharpe (Rf=0%) 1.463
# Print a table for your evaluation portfolio; note the sharp Sharpe-ratio
# drop out of sample
table.AnnualizedReturns(returns_pf_eval)
## portfolio.returns
## Annualized Return 0.0864
## Annualized Std Dev 0.1242
## Annualized Sharpe (Rf=0%) 0.6954
Chapter 1 - Introduction to Portfolio Theory
Introduction:
Challenges of portfolio optimization:
Introduction to PortfolioAnalytics:
Example code includes:
# Load the package
library(PortfolioAnalytics)
##
## Attaching package: 'PortfolioAnalytics'
## The following object is masked from 'package:igraph':
##
## constraint
library(ROI)
## Registered S3 method overwritten by 'ROI':
## method from
## print.constraint PortfolioAnalytics
## ROI: R Optimization Infrastructure
## Registered solver plugins: nlminb, glpk, quadprog, symphony.
## Default solver: auto.
##
## Attaching package: 'ROI'
## The following objects are masked from 'package:PortfolioAnalytics':
##
## is.constraint, objective
## The following object is masked from 'package:lavaan':
##
## vech
# Load the monthly index return data shipped with PortfolioAnalytics
data(indexes)
# Subset to the first four columns:
# US Bonds, US Equities, Int'l Equities, Commodities
index_returns <- indexes[, 1:4]
# Print the head of the data
head(index_returns)
## Warning in tclass.xts(x): index does not have a 'tclass' attribute
## Warning in tzone.xts(x): index does not have a 'tzone' attribute
## Warning in tclass.xts(x): index does not have a 'tclass' attribute
## Warning in tzone.xts(x): index does not have a 'tzone' attribute
## Warning: index class is Date, which does not support timezones.
## Expected 'UTC' timezone, but tzone is ''
## Warning in tzone.xts(x): index does not have a 'tzone' attribute
## Warning in tzone.xts(x): index does not have a 'tzone' attribute
## Warning in tclass.xts(x): index does not have a 'tclass' attribute
## US Bonds US Equities Int'l Equities Commodities
## 1980-01-31 -0.0272 0.0610 0.0462 0.0568
## 1980-02-29 -0.0669 0.0031 -0.0040 -0.0093
## 1980-03-31 0.0053 -0.0987 -0.1188 -0.1625
## 1980-04-30 0.0992 0.0429 0.0864 0.0357
## 1980-05-31 0.0000 0.0562 0.0446 0.0573
## 1980-06-30 0.0605 0.0296 0.0600 0.0533
# The portfolio problem is to form a minimum variance portfolio subject to full investment and long only constraints
# The objective is to minimize portfolio variance
# There are two constraints in this problem: the full investment constraint means that the weights must sum to 1, and the long only constraint means that all weights must be greater than or equal to 0 (i.e. no short positions are allowed).
# Create the portfolio specification (first argument = asset names)
port_spec <- portfolio.spec(colnames(index_returns))
# Add a full investment constraint such that the weights sum to 1
port_spec <- add.constraint(portfolio = port_spec, type = "full_investment")
# Add a long only constraint such that the weight of an asset is between 0 and 1
port_spec <- add.constraint(portfolio = port_spec, type = "long_only")
# Add an objective to minimize portfolio standard deviation
port_spec <- add.objective(portfolio = port_spec, type = "risk", name = "StdDev")
# Solve the optimization problem with the ROI (quadratic programming) backend
opt <- optimize.portfolio(index_returns, portfolio = port_spec, optimize_method = "ROI")
## Warning in tclass.xts(x): index does not have a 'tclass' attribute
## Warning in tclass.xts(x): index does not have a 'tclass' attribute
# Print the results of the optimization; the min-variance solution is
# dominated by US Bonds (~85%)
opt
## ***********************************
## PortfolioAnalytics Optimization
## ***********************************
##
## Call:
## optimize.portfolio(R = index_returns, portfolio = port_spec,
## optimize_method = "ROI")
##
## Optimal Weights:
## US Bonds US Equities Int'l Equities Commodities
## 0.8544 0.0587 0.0000 0.0869
##
## Objective Measure:
## StdDev
## 0.01668
# Extract the optimal weights
extractWeights(opt)
## US Bonds US Equities Int'l Equities Commodities
## 0.8544 0.0587 0.0000 0.0869
# Chart the optimal weights
chart.Weights(opt)
# Create the portfolio specification
port_spec <- portfolio.spec(assets = colnames(index_returns))
# Add a full investment constraint such that the weights sum to 1
port_spec <- add.constraint(portfolio = port_spec, type = "full_investment")
# Add a long only constraint such that the weight of an asset is between 0 and 1
port_spec <- add.constraint(portfolio = port_spec, type = "long_only")
# Add an objective to maximize portfolio mean return
port_spec <- add.objective(portfolio = port_spec, type = "return", name = "mean")
# Add an objective to minimize portfolio variance; risk_aversion = 10
# presumably sets the penalty on variance in the combined mean/variance
# objective -- confirm against PortfolioAnalytics docs
port_spec <- add.objective(portfolio = port_spec, type = "risk", name = "var", risk_aversion = 10)
# Solve the combined return/risk optimization with the ROI backend
opt <- optimize.portfolio(R = index_returns, portfolio = port_spec, optimize_method = "ROI")
## Warning in tclass.xts(x): index does not have a 'tclass' attribute
## Warning in tclass.xts(x): index does not have a 'tclass' attribute
Chapter 2 - Portfolio Optimization Workflow
Portfolio specification, constraints, and objectives:
Running optimizations:
Analyzing optimization results:
Example code includes:
retData <- c(0.0119, 0.0123, 0.0078, 0.0086, 0.0156, 0.0212, 0.0193, 0.0134, 0.0122, 0.01, 0, 0.0068, 0.0145, 0.0146, 0.0144, 0.0126, 0.0056, -6e-04, 0.006, -0.0319, -0.0196, -0.0214, 0.0269, 0.0113, 0.0219, 0.0082, 0.0136, 0.0243, 0.0166, 0.0102, 0.0101, 0.0048, 0.0096, 0.0045, 0.0124, 0.014, 0.0227, 0.0267, 0.0243, 0.0223, 0.0149, 0.0179, 0.0093, 0.0162, 0.0141, 0.0052, -0.0081, -2e-04, 0.0344, 0.0182, 0.0162, 0.0157, 0.0033, 0.0012, 0.0091, 0.0142, 0.0078, 0.0117, 0.008, -0.0094, 0.0148, -0.0049, 0.0053, 0.0096, 0.0033, 4e-04, -0.0159, 0.005, 0.0146, 0.0104, 0.0251, 0.0157, 0.0283, 0.0133, 0.0089, 0.015, 0.0136, -0.0058, -0.0072, -0.0087, 0.0171, 0.0146, 0.0092, 0.0054, 0.0119, 0.0017, 0.0061, 0.002, -0.0128, -0.0106, 0.0013, 0.004, -0.0017, -0.0044, 0.0081, 0.0056, -0.0096, -0.0058, -0.014, -0.0316, -0.0133, 0.0107, 0.0164, 0.0066, 0.0142, -0.0015, 4e-04, 0.0092, 0.025, 0.0116, 0.0107, 0.0064, 0.0091, 0.0012, 0.0066, 0.0098, 0.0093, 0.0054, 0.0092, 0.0127, 0.013, 0.0117, 0.006, 0.0026, 0.011, 0.0011, -0.0053, -0.0145, 0.0161, 0.0177, -0.0131, -0.0077, -9e-04, -0.0083, -0.0317, 0.0076, 0.0107, -0.0081, -0.0188, -0.0066, -0.1027, -0.1237, -0.0276, 0.0177, 0.0491, 0.0164, 0.0235, 0.05, 0.0578, 0.0241, 0.0611, 0.0315, 0.0393, 0.0298, -0.0021, -0.017, -0.0015, 0.0085, 0.0591, -0.0473, 0.0198, -0.0098, 0.0133, 0.0286, 0.0104, -0.0065, 0.0122, -0.0296, 0.0193, 0.0051, -0.001, 0.0691, 0.0454, 4e-04, -0.0089, 0.0221, -0.0167, 0.0197, -0.0065, 0.021, -0.015, 0.0234, -0.0051, -0.0027, 0.0064, -0.0354, 0.0166, 0.0142, 0.0128, -0.0022, -0.0138, -0.0241, 0.0114, -0.0124, -0.0131, 0.0189, -0.0208, 0.0075, 0.0425, 0.0682, 0.0025, -0.0016, 0.0438, -0.0362, 0.0081, -0.0077, -0.004, 0.0153, 0.0246, 0.0336, -0.0543, 0.0148, -0.0072, -0.0202, 9e-04, -0.0104, 0.027, 0.0655, 0.0413, 0.022, 0.0284, -0.0376, -0.0164, 0.0489, 0.0441, 0.0402, -0.0445, 0.0065, 0.049, -0.0192, -0.0171, 0.0078, -0.0019, 0.0104, 0.0018, 0.0381, 0.0199, 0.0529, -0.0051, -0.0532, -0.0118, -0.0316, -0.0119, 
-0.0084, 0.022, 0.0358, 0.0475, 0, -0.0438, 5e-04, -6e-04, -0.0354, 0.0232, 0.026, -0.0013, 0.01, 0.0079, -0.0092, 0.0379, -0.0153, 0.0174, -0.0186, 0.0284, 0.0387, -0.0146, -0.0142, -0.0216, 0.002, -0.0055, 0.0102, 0.0226, 0.0146, 0.0113, -0.0144, -0.0141, 0.0241, 0.023, 0.0229, -0.0122, -0.028, 0.0469, 0.028)
retData <- c(retData, -0.0016, 0.0117, 0.0255, 0.062, -0.0056, -0.0078, 0.0162, 0.033, -0.0333, -0.0114, 0.001, 0.0345, 0.0214, 0.014, -0.0016, -0.0031, -0.018, -0.014, 0.0213, -0.0147, -0.0012, 0.0054, 0.0178, 0.0122, -0.0012, 0.003, 0.0233, 0.0217, 0.0234, 0.0147, 0.035, -0.0064, 0.0054, 0.0073, 0.0095, 0.0227, 0.0252, 0.0165, 6e-04, -0.0047, -0.0069, -0.0836, -0.0215, -0.0029, 0.0164, 0.0108, 0.0181, -0.0021, 0.0159, 0.0418, 0.0207, 0.0273, 0.0084, 0.002, -0.0041, 0.0027, 0.022, 0.03, 0.0088, 0.0421, 0.0103, -0.0101, -0.0132, 0.0203, 0.0064, 0.014, -0.0019, -0.0073, -0.0209, 1e-04, 0.0308, 0.01, -0.0037, 0.0048, 0.0235, 0.036, 0.0073, 0.0106, -0.0014, 0.0103, 0.0086, 0.0015, 0.0186, -0.0033, 0.0052, 0.0139, 0.0091, -0.0117, -0.0133, 9e-04, -0.0044, -0.0031, 0.0239, 0.0222, 0.0243, 0.0092, 0.0113, 0.0345, 0.027, 0.0267, 0.0117, 0.0137, 0.0242, 0.0267, 0.0154, 0.0198, 0.0301, 0.0075, 0.0046, 0.0093, -0.001, 0.0202, 0.0019, 0.0088, 0.0104, 0.0143, 0.0337, 0.0266, 0.0037, 0.0134, 0.0032, -0.0052, 6e-04, 0.0133, 0.0173, 0.0124, 0.0112, -0.0032, 0.01, 0.0122, 0.0253, 0.0065, 0.0172, 0.0193, 0.0086, -0.0015, 9e-04, 0.0099, 0.0033, 0.0194, 0.0179, 0.0165, 0.015, 0.0145, 0.0108, 0.0164, 0.018, 0.0027, -0.0056, -0.0118, 0.0095, 0.0175, -0.0169, 2e-04, -0.0233, 0.0014, -0.0126, 0.0088, 0.0137, -0.0031, -0.0182, -0.0072, -0.0518, -0.0775, -0.0435, -0.0197, 0.0082, -0.0122, 0.0022, 0.0387, 0.0504, 0.0198, 0.0311, 0.0244, 0.0791, 0.0525, -0.012, 0.0119, 0.0315, 0.0581, 0.056, -0.0066, 0.0229, -0.0572, -0.0378, 0.016, -0.0429, 0.0339, 0.0318, 0.0041, -0.0825, -0.0422, 0.0019, -0.1922, -0.0395, 0.014, 0.043, -0.0098, -0.012, 0.0102, 0.0585, 0.063, 0.0061, 0.0654, -0.0061, -0.0147, -0.0069, 0.0288, 0.0692, 0.123, 0.0077, 0.0528, 0.0318, -0.0541, -0.0433, 0.0334, 0.0025, 0.0368, -0.0462, -0.0256, -0.0385, 0.0116, 0.0586, -0.0221, -0.0175, 0.0114, 0.0278, 0.016, -0.0286, 0.003, -0.0425, 0.0278, 0.0483, 0.0421, 0.0273, 0.0181, 0.0331, 0.0144, 1e-04, -0.0292, -0.0309, 0.0119, 
-0.0252, 0.0154, 0.019, 0.0048, 0.0012, 0.0084, 0.0019, 0.045, 0.0433, 0.0268, 0.0104, 0.0374, 0.0264, 0.0259, 0.0096, 0.0403, 0.0251, 0.0253, 0.0172, -0.0252, -0.0181, 0.002, -0.0027, 0.0133, 0.028, 0.0185, 0.0328, 0.0201, 0.0143, 0.0346, -0.0197, -0.0049, 0.0072, 0.016, 0.0257, 0.0152, 0.0402, -0.023, 0.0279, 0.0284, 0.0526, 0.0161, 0.0122, 0.0365, -0.0389, -0.0097, 0.0067, 0.0133, 0.0011, 0.0257, 0.0323, 0.0291, 0.0079, 0.01, 0.0185, 0.0255, 0.027, 0.0236, 0.0275, -0.0274, 0.0428, 0.0485, -0.0237, 0.013, -0.0503, 0.028, -0.0379, 0.019)
retData <- c(retData, 0.0163, -0.0274, -0.033, -0.0336, -0.0982, -0.1331, -0.0391, -0.001, -0.0112, -0.0133, 0.035, 0.0663, 0.0884, 0.0013, 0.0451, 0.0166, 0.0189, 0.0101, 0.0016, 0.0119, 0.0189, 0.0165, 0.0247, 0.0017, 0.0202, 0.0095, 0.0041, 0.0066, 0.006, 0.0135, 0.0179, 0.0067, 0.008, 0.0108, 0.0012, -0.0107, 0.0061, 0.0052, 0.0158, 0.0209, 0.0101, 0.0023, 0.0033, 0.0107, 0.0089, 0.0168, 0.0135, 0.0095, 0.0095, 0.0066, 0.0133, 0.0198, 0.0075, 0.0253, 0.0134, 0.0168, 0.0062, 0.0171, 0.0063, 0.021, 0.0058, 0.004, 0.0045, 0.016, 0.0075, 0.012, 0.0108, 0.0075, 0.0077, 0.0017, 0.0031, 0.0094, 0.0023, 0.0058, 0.0055, 0.0056, 0.0065, -7e-04, 0.0047, 0.0076, 0.0053, 0.0022, -0.0013, 0.0069, 0.0015, 0.0016, 0.0025, 0.0094, 0.0083, 0.0024, 0.0015, 0.0031, 0.0107, 0.0034, -6e-04, 0.0031, 0.0078, 0.0115, 0.0046, 0.0054, 0.0109, 0.0063, 0.0032, -0.0082, 0.0024, 0.0042, 6e-04, -9e-04, 0.0085, -5e-04, 0.014, 0.0058, 0.0081, 0.008, 0.0019, -0.003, 0.0047, 0.0081, 0.0078, 0.0062, 0.0087, 1e-04, 0.0061, 0.0068, 0.0115, 0.0046, 0.0098, 0.0102, 2e-04, 0.0063, 0.0051, -9e-04, 9e-04, 0.0065, 0.0075, 0.0107, 0.0083, 0.0051, 0.0101, 0.0089, 0.0121, 0.0077, 0.0051, -0.0094, 0.0123, 0.0168, -0.0018, 0.0054, -0.0112, 0.012, -0.0049, 0.0059, 0.0126, 0.0156, -0.01, -0.0135, -0.0285, -0.0044, -0.0587, 5e-04, 0.0079, -0.0046, 0.0021, -0.0012, 0.0146, 0.0036, 0.0042, 0.007, 0.0213, 0.0084, -0.0023, -5e-04, 0.0346, 0.0258, 0.0307, 0.0071, 0.0329, 0.0061, 0.0134, 0.0154, 0.0055, 0.0294, 0.0263, 0.0104, -0.0083, 2e-04, -0.0037, -0.0886, -0.011, 0.0091, 0.0244, 0.0219, 0.0201, -0.0042, 0.0193, 0.0429, 0.0215, 0.0297, 0.0096, -0.0027, 0.009, 0.0054, 0.0284, 0.0286, 0.0088, 0.0346, 0.0069, -0.0059, -0.0034, 0.0268, 0.0057, 0.0173, 0.0048, -0.0068, -0.0136, 0.0127, 0.0298, 0.0045, -0.0042, 0.011, 0.0185, 0.0063, 0.0049, 0.009, -0.0254, 0.0148, 0.0105, 0.0107, 0.0078, -0.0071, 0.0153, 0.0046, 1e-04, -0.0283, -0.03, 0.006, -0.007, 0.0031, 0.0216, 0.0044, 0.0154, 0.0026, 0.0083, 0.0272, 0.0301, 0.0181, 
0.0119, 0.0133, 0.0133, 0.0191, 0.0116, 0.0172, 0.0234, 0.0113, 0.0016, 2e-04, -0.0023, 0.0113, -0.0082, 0.0035, 0.0103, 0.0124, 0.0306, 0.0244, 4e-04, 0.0144, -4e-04, -0.0128, 0.0065, 0.0133, 0.0215, 0.0092, 0.01, -0.0173, 0.0125, 0.0142, 0.0341, 0.0051, 0.0185, 0.0164, 8e-04, 0.0012, -0.0011, 0.0112, 0.0035, 0.0206, 0.0182, 0.0168, 0.0201, 0.0207, 0.0146, 0.0197, 0.0213, -7e-04, -0.0032, -0.0144, 0.0134, 0.0214, -0.0202, 7e-04, -0.0271, 0.0084, -0.0168, 0.0118, 0.0176, -0.0113, -0.0166, -0.0025, -0.0627, -0.0625, -0.0301, -0.0071, 0.0132, -0.0091, 0.0117, 0.0337, 0.0442, 0.0123, 0.0291, 0.0207, 0.0191, 0.0122, 0.0109, 0.013, 0.0118, 0.0108, 0.0095, 0.0087, 0.0119, -0.0032, 0.0053, 0.0079, -0.0026, 0.0098)
retData <- c(retData, 0.0128, 0.0075, 0.004, -0.008, 0.0106, -0.0143, -0.0362, -0.0801, 0.0052, 0.012, 0.0158, 0.0208, 0.016, 0.0106, 0.0072, 0.0088, 0.0051, -0.0028, 0.0092, 0.0087, 0.0106, 0.0097, 0.0041, 0.0097, -0.0061, -6e-04, 0.0107, 0.0058, 0.0018, 0.0107, 0.0076, 6e-04, 0.0066, 0.0048, 0.0163, 0.0054, 0.0051, 0.0094, 0.0068, 0.0017, 0.0054, 0.0105, -0.0013, 0.0134, -0.0024, 0.0053, 0.0086, 0.0056, 0.0045, 0.0113, 0.0099, 0.0069, 0.0057, 0.0097, -0.0033, -0.0063, 0.0054, 0.0153, 0.0106, 0.0079, 0.0019, 0.0091, 0.0207, 0.0044, -0.0092, 0.0043, 0.0105, 0.0035, 0.0069, 0.0101, 0.0092, 0.0084, 3e-04, 0.0062, 0.004, 0.0055, 0.0062, 0.0036, 0.0012, 0.0028, 0.0075, 0.006, 0.0044, 0.0085, 0.0024, -3e-04, -0.001, 0.001, 0.0081, 0.0036, 0.0062, 0.0057, 0.0015, 0.0054, 0.0093, 0.0041, 0.0055, 0.0121, 0.0059, 0.0036, 0.0064, 0.0037, 0.0014, 0.0067, 0.006, 0.0072, 0.0069, 0.0106, 0.006, 0.0071, 0.0055, 0.0048, 7e-04, -0.0048, 0.0164, 0.0114, -0.0094, 0.0036, -0.0012, -0.0049, -0.0306, 0.0187, 0.0103, -0.0027, -0.0023, -3e-04, -0.0506, -0.0867, -0.0308, -0.0035, 0.0112, 0.0065, 0.0057, 0.0221, 0.0365, 0.0126, 0.0322, 0.0202, 0.0573, 0.0175, -0.0119, 0.0172, 0.0108, 0.0218, 0.0738, -0.018, 0.029, -0.0142, 0.0106, 0.0264, -0.005, 0.0128, 0.057, 0.0034, 0.0095, 0.012, 0.0058, -0.0263, -0.0059, -0.0223, 0.0194, 0.0233, 0.0086, -0.0111, 0.0024, 0.0329, -0.0055, 0.0214, -0.0018, -0.0061, -2e-04, 0.0073, 0.0405, 0.0612, 0.0021, 0.0408, -0.0104, -0.0304, -0.007, 0.0154, 0.0037, 0.0248, -0.0149, -0.0024, 0.0125, 0.0472, 0.0214, -0.0072, 0.0038, 0.0049, 0.0032, 0.0017, -0.004, 6e-04, -0.007, 0.0208, 0.0021, 0.0138, 0.0069, -0.0035, 0.0064, 0.0098, 0.0123, -0.0022, -0.0078, 0.0063, 0.0054, -0.0086, 0.0047, 0.0192, 0.0182, 0.0166, -0.0122, 0.0117, 0.0397, 0.0056, -0.0035, 0.0202, 0.0215, 0.0111, 0.0031, 0.0293, 0.0117, 0.015, 0.0064, -0.0178, -0.0081, -0.0019, -0.0014, -0.0039, 8e-04, 0.0138, 0.028, 0.0033, -0.0047, 0.0171, -0.0027, -0.008, 0.0088, 0.0116, 0.0119, 0.0083, 0.0269, 
-0.0074, 0.0164, 0.0135, 0.0258, 2e-04, 0.0094, 0.0238, -0.0155, -0.0015, 6e-04, -0.0039, -0.0067, 0.0097, 0.0199, 0.0116, 0.0061, 0.0018, 0.0027, 0.0152, 0.0192, 0.0107, 0.0116, -0.0116, 0.033, 0.0304, -0.0063, 0.0104, -0.001, 0.0312)
retData <- c(retData, -0.0169, 0.0078, 0.0114, 0.003, -0.0213, -0.0133, -0.0313, -0.0157, 0.0033, 0.0118, 0.0029, -0.0055, 0.0048, 0.0127, 0.0348, -0.0076, 0.0166, 0.005, 0.0281, -6e-04, -0.0084, 0.0084, 0.0394, 0.0223, 0.0454, 0.0107, 0.0429, 0.001, -0.0026, 0.0104, 0.0013, 0.0342, 0.0336, 0.012, -0.0087, 0.0167, -6e-04, -0.0552, 0.0206, 0.0169, 0.0291, 0.0408, 0.0258, -0.0169, 0.0229, 0.0312, 0.0095, 0.0315, 0.0177, 0.0022, 0.0113, 0.0212, 0.0481, 0.0745, 0.0075, 0.0699, 6e-04, -0.0201, -0.0097, 0.0349, 6e-04, 0.0345, -0.0016, -0.0084, -0.0153, 0.0248, 0.0165, -0.0264, -0.0199, 0.0246, 0.0043, 0.0019, -0.0144, -0.0096, -0.0348, 0.0099, 0.02, 0.018, -0.0037, -0.0123, 0.0155, -0.0042, -0.0034, -0.0249, -0.0389, 0.0041, -0.016, 0.0123, 0.0224, -0.0149, 5e-04, -0.0037, 0.002, 0.0298, 0.0362, 0.0128, 0.0118, 0.0179, 0.0094, 0.0299, 0.013, 0.0191, 0.0192, 0.0123, 0.0041, -0.0165, -0.0035, 0.0091, -0.0154, -0.0022, 0.021, 0.0074, 0.0308, 0.0178, -0.0017, 0.021, -0.0096, -0.0184, 0.0115, 0.0195, 0.0265, 0.0097, 0.0222, -0.0174, 0.0211, 0.0249, 0.0381, 0.0016, 0.0238, 0.0172, -0.0248, -0.0062, -0.0031, 0.0114, 5e-04, 0.0194, 0.02, 0.0153, 0.0121, 0.0082, 0.0115, 0.0198, 0.0224, 0.0077, 9e-04, -0.016, 0.0256, 0.0281, -0.0225, 0.0043, -0.04, 0.014, -0.0236, 0.0223, 0.0227, -0.0164, -0.0261, -0.0146, -0.0675, -0.0629, -0.0188, 0.0081, -0.0017, -0.0161, 0.0188, 0.0375, 0.0516, 9e-04, 0.0277, 0.0157, 0.015, 0.0034, 0.006, -1e-04, 0.0197, 0.0231, 0.02, 0.0079, 0.0197, 0.0094, 0.0223, 0.0158, 0.0055, 0.0212, 0.0164, 0.0139, -9e-04, 0.0072, 7e-04, -0.0544, 0.0076, 0.0159, 0.022, 0.0224, 0.0112, 0.0036, 0.0133, 0.0218, 0.021, 0.0222, 0.0147, 0.005, 0.0116, 0.0096, 0.0237, 0.009, 0.0143, 0.0239, 0.0131, 0.0188, 0.0146, 0.0167, 0.0116, 0.0157, 0.0137, 0.0026, 0.0102, 0.0125, 0.0111, 0.0054, -0.0061, 0.0058, 0.0161, -0.0087, 0.0079, 0.0099, -0.0267, 0.0085, 0.0014, 0.0045, 0.0077, -0.0044, 0.0073, -0.0013, 0, -0.017, -0.0174, 0.0061, -0.0028, 0.0032, 0.0054, 0.0046, 0.004, 0.0018, 
-7e-04, 0.0099, 0.0154, 0.0048, 0.0053, 0.007, 0.0077, 0.0111, 0.0044, 0.0098, 0.0097, 0.0051, 0.0017, -0.0039, 0, 0.0017, -0.0092, 0.0011, 0.0042, 0.0074, 0.0164, 0.0133, 0, 0.0065, 0.0032, -0.0105, 0.0095, 0.0085, 0.0115, 0.0061, 0.0035, -0.0145, 0.0112, 0.0138, 0.0272, 0.0104, 0.0144, 0.0119, 9e-04, 0.0087, 0.0058, 0.0053, 0.0041, 0.0132, 0.0142, 0.0133, 0.0191, 0.0255, 0.0063, 0.016, 0.0171, -0.0053, -0.0054, 1e-04, 0.0131, 0.0191, -0.0149, -0.0025, -0.0126, 0.006, -0.0045, 0.0149, 0.0136, -0.0109, 0.0011, 0.0051, -0.0276, -0.0245, 6e-04, 0.0162, 0.0056, 6e-04, 0.0125, 0.0081, 0.0107, 0.0104, 0.0068, 0.0102, 0.018, 0.0118, 0.001, 0.0122, 0.0173, 0.0198, 0.0181, 0.0103, 0.0183, 0.0079, 0.0111, 0.0082, 0.0132, 0.013)
# Append further blocks of monthly returns to retData. The full return matrix
# is assembled across several c() statements — presumably to keep individual
# source lines manageable; the combined vector is reshaped into a
# 152 x 13 matrix further below (see the str() output after asset_returns).
retData <- c(retData, 0.0145, 0.0145, 0.0053, 0.0026, 0.0011, -0.0341, 5e-04, -0.014, 0.0198, 0.0164, 0.0195, 0.0085, 0.0116, 0.0238, 0.0146, 0.0148, 0.011, 0.0062, 0.0105, 0.007, 0.0137, 0.0183, 0.0173, 0.0185, 0.0163, 0.0092, 0.008, 0.0176, 0.0084, 0.0157, 0.0075, -4e-04, 6e-04, 0.0075, 0.0333, 0.003, -0.0011, 0.0174, 0.0141, 0.0019, 0.001, -0.0031, -0.0221, 0.0164, 0.0136, 0.0097, 0.0097, -0.0011, 0.0145, 0.007, 0.0031, -0.0107, -0.0185, 0.0058, -0.011, 0.0084, 0.0185, 0.0023, 0.0067, -4e-04, 0.0049, 0.0186, 0.0212, 0.0071, 0.0041, 0.0058, 0.0086, 0.0159, 0.0102, 0.0127, 0.0146, 0.0057, 0.0038, -0.0045, -0.0037, 0.0022, 7e-04, 0.0031, 0.0052, 0.004, 0.0149, 0.0099, 0.0012, 0.0081, -0.0042, -0.0108, -2e-04, 0.0095, 0.0149, 0.0053, 0.0122, -0.0038, 0.0067, 0.0126, 0.0238, 0.0073, 0.0157, 0.0126, -0.0025, 0.0021, 0.0017, 0.0092, 0.004, 0.0132, 0.0129, 0.0128, 0.0135, 0.0114, 0.0081, 0.0134, 0.0156, 0.01, 4e-04, -0.0077, 0.0153, 0.02, -0.0112, 0.0022, -0.0118, 0.0064, -0.0162, 0.013, 0.0159, -0.0084, -0.0125, -0.0023, -0.0538, -0.0692, -0.0209, 0.0031, 0.01, -0.0016, 0.01, 0.0342, 0.0392, 0.0101, 0.026, 0.0162, -0.0166, 0.0426, 0.0778, -0.0129, -0.0737, -0.0065, -0.0429, -0.0072, -0.0155, 0.0572, 0.0217, 0.0161, 0.0014, 0.0155, 0.0637, 0.0657, 0.1437, -0.0053, 0.0343, 0.2463, -0.0376, -0.1077, -0.0756, -0.0531, -0.0665, 0.0833, -0.0154, -0.0375, 9e-04, -0.0412, 0.0092, 0.0468, 0.0401, -0.013, -0.1239, -0.1137, 0.0427, -0.134, -0.023, 0.1028, 0.0704, -0.1107, 0.0553, -0.1135, 0.1204, 0.0784, 0.1657, 0.0063, -0.0271, 0.1021, 0.062, -0.0991, -0.013, 0.011, 0.0353, 0.0752, 0.0941, -0.0298, -0.0655, -0.0251, 0.0343, 0.039, -0.0446, 0.0483, 0.0346, 0.0548, 0.0644, 0.0015, 0.0731, -0.0405, -0.0547, 0.0443, 0.0162, 0.013, -0.0075, -0.0656, -0.0499, -0.0162, -0.0361, -0.0354, 0.0136, -0.0656, -0.0136, -0.0178, -0.009, 0.0018, -0.0148, 0.0384, -0.0024, -0.0051, 0.0638, 0.0126, -0.0216, -0.0092, -0.0574, -0.0391, 0.0387, 0.0118, 0.0244, 0.0393, -0.0475, -0.0032, -0.0242, 
0.0259, 0.0198, 0.0233, -0.03, -0.0035, -0.0288, 0.0064, -0.0139, -0.0012, 0.0246, 0.0118, 0.0173, -0.0156, -0.0236, -0.038, -0.0268)
# Final block of return values for the remaining series
retData <- c(retData, 0.0039, -0.0107, 0.0028, -0.0051, -0.0265, -0.0199, 0.0236, 0.0486, 0.0092, -0.0207, -0.0026, 0.0719, 0.0056, 0.0556, 0.03, 0.0192, -0.0461, -0.0142, 0.0751, 0.0072, -0.0215, 0.0378, 0.117, 0.0428, -0.0146, 0.0282, 0.0328, -0.0462, -0.082, 8e-04, -0.0094, -0.0596, -0.0165, 0.0317, 0.0106, -0.0077, 9e-04, 0.0275, 0.0225, 0.0435, 0.0051, 0.0334, -0.0099, -0.0034, 0.0089, -0.0036, 0.0256, 0.0373, 0.0125, -0.0072, 0.0021, -7e-04, -0.0616, -0.0037, -2e-04, 0.022, 0.0222, 0.0202, -0.0063, 0.0213, 0.04, 0.0119, 0.0282, 0.0088, 0.0028, 0.0052, 0.013, 0.0483, 0.0622, 0.0169, 0.0666, 0.0039, -0.0269, -0.0122, 0.0311, -0.0022, 0.0267, -0.0069, -0.0104, -0.0205, 0.0133, 0.0223, -0.0089, -0.0068, 0.0104, 0.008, 0.0013, -0.004, 0.0019, -0.0142, 0.0095, 0.0058, 0.0099, 0.003, -0.0015, 0.009, 0.0052, 0.005, -0.0095, -0.014, 0.0037, -0.0033, -0.0031, 0.0106, 0.0077, 0.0072, 0.0031, -4e-04, 0.0134, 0.0205, 0.0068, 0.0025, 0.0078, 0.0121, 0.0152, 0.007, 0.0139, 0.0156, 0.0111, 0.0043, -0.0068, -0.0082, 0.0034, -0.0049, -0.001, 0.0099, 0.0068, 0.0244, 0.0145, 6e-04, 0.0136, -0.0044, -0.0141, 0.0018, 0.0131, 0.0134, 0.0079, 0.0147, -0.0149, 0.016, 0.0191, 0.0286, 0.0037, 0.0164, 0.0171, -0.0133, -0.0028, -5e-04, 0.0066, -3e-04, 0.0163, 0.0185, 0.0175, 0.0121, 0.0096, 0.0096, 0.0163, 0.0204, 0.0082, 0.0041, -0.0222, 0.0199, 0.0303, -0.0148, 0.004, -0.0272, 0.0142, -0.0262, 0.0097, 0.0172, -0.0068, -0.0264, -0.0156, -0.0618, -0.06, -0.0192, -0.0119, 0.006, -0.0037, 8e-04, 0.0092, 0.0312, 0.0024, 0.0153, 0.0113)
idxData <- as.Date(c(9892, 9920, 9951, 9981, 10012, 10042, 10073, 10104, 10134, 10165, 10195, 10226, 10257, 10285, 10316, 10346, 10377, 10407, 10438, 10469, 10499, 10530, 10560, 10591, 10622, 10650, 10681, 10711, 10742, 10772, 10803, 10834, 10864, 10895, 10925, 10956, 10987, 11016, 11047, 11077, 11108, 11138, 11169, 11200, 11230, 11261, 11291, 11322, 11353, 11381, 11412, 11442, 11473, 11503, 11534, 11565, 11595, 11626, 11656, 11687, 11718, 11746, 11777, 11807, 11838, 11868, 11899, 11930, 11960, 11991, 12021, 12052, 12083, 12111, 12142, 12172, 12203, 12233, 12264, 12295, 12325, 12356, 12386, 12417, 12448, 12477, 12508, 12538, 12569, 12599, 12630, 12661, 12691, 12722, 12752, 12783, 12814, 12842, 12873, 12903, 12934, 12964, 12995, 13026, 13056, 13087, 13117, 13148, 13179, 13207, 13238, 13268, 13299, 13329, 13360, 13391, 13421, 13452, 13482, 13513, 13544, 13572, 13603, 13633, 13664, 13694, 13725, 13756, 13786, 13817, 13847, 13878, 13909, 13938, 13969, 13999, 14030, 14060, 14091, 14122, 14152, 14183, 14213, 14244, 14275, 14303, 14334, 14364, 14395, 14425, 14456, 14487))
nameData <- c('Convertible Arbitrage', 'CTA Global', 'Distressed Securities', 'Emerging Markets', 'Equity Market Neutral', 'Event Driven', 'Fixed Income Arbitrage', 'Global Macro', 'Long/Short Equity', 'Merger Arbitrage', 'Relative Value', 'Short Selling', 'Funds of Funds')
# Assemble the return vector into a 13-column time series: the matrix gets its
# column labels up front via dimnames (equivalent to assigning names() on the
# xts object afterwards), and the Date vector supplies the time index.
asset_returns <- xts(
  matrix(retData, ncol = 13, dimnames = list(NULL, nameData)),
  order.by = idxData
)
# Confirm the structure: 152 monthly observations x 13 strategies
str(asset_returns)
## An 'xts' object on 1997-01-31/2009-08-31 containing:
## Data: num [1:152, 1:13] 0.0119 0.0123 0.0078 0.0086 0.0156 0.0212 0.0193 0.0134 0.0122 0.01 ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr [1:13] "Convertible Arbitrage" "CTA Global" "Distressed Securities" "Emerging Markets" ...
## Indexed by objects of class: [Date] TZ: UTC
## xts Attributes:
## NULL
# Get the column names of the returns data; these identify the assets that
# the optimization will allocate across.
asset_names <- colnames(asset_returns)
# Create a portfolio specification object using asset_names.
# portfolio.spec() comes from the PortfolioAnalytics package — presumably
# loaded earlier in this document; not visible in this section.
port_spec <- portfolio.spec(assets=asset_names)
# Get the class of the portfolio specification object — it carries both a
# specific ("portfolio.spec") and a generic ("portfolio") class.
class(port_spec)
## [1] "portfolio.spec" "portfolio"
# Print the portfolio specification object (auto-printing at the console)
port_spec
## **************************************************
## PortfolioAnalytics Portfolio Specification
## **************************************************
##
## Call:
## portfolio.spec(assets = asset_names)
##
## Number of assets: 13
## Asset Names
## [1] "Convertible Arbitrage" "CTA Global" "Distressed Securities"
## [4] "Emerging Markets" "Equity Market Neutral" "Event Driven"
## [7] "Fixed Income Arbitrage" "Global Macro" "Long/Short Equity"
## [10] "Merger Arbitrage"
## More than 10 assets, only printing the first 10
# --- Constraints ---------------------------------------------------------
# Each constraint added via add.constraint() is a separate object stored in
# the constraints slot of the portfolio specification, so constraints stay
# modular: easy to add, remove, or modify independently. add.constraint()
# takes the portfolio, the constraint type, and type-specific arguments
# forwarded through ... to the constraint's constructor.
#
# Constraint types used or mentioned in this course:
#   weight_sum (also weight, leverage): bounds the sum of all weights.
#     "full_investment" is shorthand for min_sum = max_sum = 1;
#     "dollar_neutral" is shorthand for min_sum = max_sum = 0.
#   box: per-asset weight bounds. "long_only" is shorthand for
#     min = 0 and max = 1.
#   group: bounds the summed weight of asset groups (sector, region,
#     asset class, etc.).
#   return: targets a mean portfolio return.
port_spec <- port_spec %>%
  # Fully invested: the weights must sum to exactly 1
  add.constraint(type = "weight_sum", min_sum = 1, max_sum = 1) %>%
  # Box: assets 1-5 require at least 10%, assets 6-13 at least 5%; 40% cap
  add.constraint(type = "box", min = c(rep(0.1, 5), rep(0.05, 8)), max = 0.4) %>%
  # Group: each of the two index groups must hold between 40% and 60%
  add.constraint(
    type = "group",
    groups = list(c(1, 5, 7, 9, 10, 11), c(2, 3, 4, 6, 8, 12)),
    group_min = 0.4,
    group_max = 0.6
  )
# Show the specification with the three constraints attached
print(port_spec)
## **************************************************
## PortfolioAnalytics Portfolio Specification
## **************************************************
##
## Call:
## portfolio.spec(assets = asset_names)
##
## Number of assets: 13
## Asset Names
## [1] "Convertible Arbitrage" "CTA Global" "Distressed Securities"
## [4] "Emerging Markets" "Equity Market Neutral" "Event Driven"
## [7] "Fixed Income Arbitrage" "Global Macro" "Long/Short Equity"
## [10] "Merger Arbitrage"
## More than 10 assets, only printing the first 10
##
## Constraints
## Enabled constraint types
## - weight_sum
## - box
## - group
# --- Objectives ----------------------------------------------------------
# Objectives are added with add.objective(); like constraints, each one is a
# separate object stored in the objectives slot of the specification, so
# they remain modular. The name argument must be a valid R function —
# PerformanceAnalytics supplies several, and user-defined functions work too.
#
# Objective types:
#   return:      maximize the named measure.
#   risk:        minimize the named measure.
#   risk_budget: minimize risk concentration, or penalize any asset whose
#                percentage contribution to risk falls outside the allowed
#                [min_prisk, max_prisk] band.
port_spec <- port_spec %>%
  # Maximize mean return
  add.objective(type = "return", name = "mean") %>%
  # Minimize portfolio standard deviation
  add.objective(type = "risk", name = "StdDev") %>%
  # Keep each asset's share of StdDev risk between 5% and 10%
  add.objective(type = "risk_budget", name = "StdDev",
                min_prisk = 0.05, max_prisk = 0.1)
# Show the specification with the three objectives attached
print(port_spec)
## **************************************************
## PortfolioAnalytics Portfolio Specification
## **************************************************
##
## Call:
## portfolio.spec(assets = asset_names)
##
## Number of assets: 13
## Asset Names
## [1] "Convertible Arbitrage" "CTA Global" "Distressed Securities"
## [4] "Emerging Markets" "Equity Market Neutral" "Event Driven"
## [7] "Fixed Income Arbitrage" "Global Macro" "Long/Short Equity"
## [10] "Merger Arbitrage"
## More than 10 assets, only printing the first 10
##
## Constraints
## Enabled constraint types
## - weight_sum
## - box
## - group
##
## Objectives:
## Enabled objective names
## - mean
## - StdDev
## - StdDev
# There are two functions for running the optimization, optimize.portfolio() and optimize.portfolio.rebalancing()
# This exercise will focus on single period optimization and the next exercise will use optimize.portfolio.rebalancing() for optimization with periodic rebalancing. optimize.portfolio() supports single-period optimization
# Key arguments include R for the asset returns, portfolio for the portfolio specification object, and optimize_method to specify the optimization method used to solve the problem
# In many cases, it is useful to specify trace = TRUE to store additional information for each iteration/trial of the optimization.
# The following optimization methods are supported:
# DEoptim: Differential evolution
# random: Random portfolios
# GenSA: Generalized Simulated Annealing
# pso: Particle swarm optimization
# ROI: R Optimization Infrastructure for linear and quadratic programming solvers
# The optimization method you choose should be based on the type of problem you are solving
# For example, a problem that can be formulated as a quadratic programming problem should be solved using a quadratic programming solver, whereas a non-convex problem should be solved using a global solver such as DEoptim.
# In this exercise, we will define the portfolio optimization problem to maximize mean return and minimize portfolio standard deviation with a standard deviation risk budget where the minimum percentage risk is 5% and the maximum percentage risk is 10%, subject to full investment and long only constraints
# The risk budget objective requires a global solver so we will solve the problem using random portfolios
# The set of random portfolios, rp, is generated using 500 permutations for this exercise
# Random-portfolio weight data, assembled across several c() statements.
# Presumably 500 candidate weight vectors over the 13 assets (matching the
# "500 permutations" mentioned above) — TODO confirm the final reshape, which
# occurs past the end of this section.
rpData <- c(0.0769, 0, 0, 0, 0, 0.01, 0.012, 0.326, 0, 0.006, 0.058, 0.112, 0.002, 0.028, 0.002, 0.016, 0.056, 0.004, 0.056, 0, 0.178, 0, 0.002, 0.34, 0, 0.008, 0.03, 0.078, 0.014, 0.088, 0.066, 0.006, 0.018, 0.072, 0.106, 0.212, 0.08, 0.032, 0, 0, 0.07, 0.204, 0.01, 0.006, 0.018, 0, 0.022, 0.036, 0.022, 0.08, 0.174, 0.22, 0.012, 0.024, 0.134, 0.004, 0.094, 0.162, 0.048, 0, 0.002, 0.01, 0, 0.046, 0.296, 0.026, 0.01, 0.01, 0, 0.004, 0.004, 0.024, 0, 0.25, 0.112, 0.082, 0, 0.002, 0.118, 0.012, 0.31, 0.048, 0.324, 0.014, 0, 0.376, 0, 0.034, 0.018, 0.002, 0.038, 0, 0.184, 0.004, 0.074, 0.038, 0.008, 0.214, 0.218, 0, 0.06, 0, 0.326, 0, 0.006, 0.108, 0, 0.022, 0.036, 0.09, 0.496, 0, 0, 0, 0.106, 0.044, 0.134, 0, 0, 0.014, 0.004, 0.008, 0.106, 0.292, 0.178, 0, 0.004, 0, 0, 0.182, 0.224, 0, 0.042, 0, 0.374, 0.032, 0.274, 0.186, 0.092, 0.008, 0, 0, 0.306, 0.146, 0, 0.024, 0.126, 0.16, 0.016, 0.058, 0.068, 0.01, 0.138, 0.302, 0.054, 0.042, 0.002, 0, 0, 0, 0.026, 0.004, 0, 0.018, 0.09, 0, 0, 0, 0.046, 0.008, 0.212, 0.016, 0, 0.014, 0, 0.152, 0.006, 0, 0.298, 0, 0, 0.018, 0.058, 0.13, 0.678, 0, 0.424, 0.012, 0.014, 0.032, 0.036, 0, 0.482, 0, 0.004, 0.38, 0.002, 0, 0.1, 0.002, 0.002, 0.084, 0.158, 0.026, 0.034, 0.094, 0, 0, 0.006, 0.51, 0.01, 0.106, 0.078, 0, 0.35, 0, 0.046, 0.024, 0.1, 0.34, 0, 0, 0.048, 0.24, 0.038, 0.008, 0.04, 0, 0.004, 0.024, 0.004, 0, 0.038, 0.262, 0.046, 0.006, 0.346, 0.004, 0, 0.578, 0.724, 0, 0.308, 0.008, 0, 0, 0.046, 0.048, 0.004, 0.212, 0, 0.038, 0.026, 0.114, 0.008, 0.44, 0.088, 0, 0.65, 0.202, 0.174, 0.164, 0.052, 0.486, 0.01, 0, 0.034, 0.124, 0.006, 0.012, 0, 0.66, 0.164, 0, 0.428, 0.478, 0.516, 0.096, 0, 0.33, 0, 0.004, 0.08, 0, 0.092, 0, 0.024, 0.06, 0, 0.002, 0.048, 0.04, 0.002, 0.172, 0, 0.018, 0.046, 0.094, 0.028, 0.046, 0.062, 0.036, 0.01, 0.694, 0.032, 0.01, 0.02, 0.022, 0.036, 0.012, 0, 0.356, 0, 0.008, 0, 0.128, 0.412, 0.154, 0.11, 0.176, 0, 0.02, 0.418, 0.016, 0, 0.034, 0.068, 0.12, 0.058, 0.006, 0.002, 0.002, 0.03, 0.208, 0, 0.034, 
0.054, 0.1, 0.73, 0.056, 0, 0.414, 0.006, 0.008, 0.012, 0.03, 0.01, 0.182, 0.156, 0.052, 0, 0.034, 0.01, 0.112, 0, 0.358, 0.016, 0.02, 0, 0.062, 0.068, 0.006, 0.01, 0.052, 0.006, 0.03, 0.122, 0.004, 0, 0.002, 0, 0.014, 0.004, 0, 0, 0.048, 0.004, 0, 0.464, 0.018, 0.022, 0.034, 0, 0.024, 0.186, 0, 0.134, 0.064, 0, 0.094, 0, 0, 0.122, 0.09, 0.01, 0.744, 0.002, 0.022, 0.36, 0.166, 0.042, 0.14, 0.206, 0.242, 0, 0.006, 0.002, 0.022, 0.026, 0.196, 0, 0, 0.004, 0.034, 0.228, 0.172, 0.002, 0.014, 0.052, 0.012, 0, 0.162, 0.046, 0.044, 0.494, 0.128, 0.202, 0.002, 0, 0.096, 0.002, 0.156, 0.06, 0, 0.126, 0, 0.062, 0.1, 0.004, 0.046, 0.02, 0.098, 0, 0, 0.084, 0.028, 0.012, 0.114, 0.116, 0.018, 0.074, 0.018, 0.012, 0.16, 0.036, 0.046, 0.024, 0.098, 0.014, 0, 0.002, 0.054, 0.038)
# Append the next block of candidate weights
rpData <- c(rpData, 0.092, 0.02, 0.124, 0.108, 0.056, 0.1, 0.242, 0, 0.066, 0.144, 0.082, 0.006, 0.018, 0.082, 0.088, 0.098, 0.088, 0, 0.18, 0.04, 0.008, 0.012, 0, 0.164, 0.016, 0, 0.254, 0.192, 0.044, 0, 0.17, 0.192, 0.072, 0.02, 0, 0.184, 0.0769, 0.118, 0.058, 0, 0.066, 0.086, 0, 0.002, 0.046, 0.016, 0, 0.004, 0, 0, 0.174, 0.296, 0.144, 0.092, 0.028, 0.2, 0.04, 0.004, 0.236, 0.03, 0, 0.23, 0.192, 0.016, 0.152, 0.032, 0.008, 0, 0.27, 0.01, 0.598, 0.078, 0, 0.078, 0.142, 0.022, 0, 0, 0, 0.032, 0.526, 0.2, 0, 0.024, 0, 0.016, 0, 0, 0.002, 0.202, 0.008, 0.302, 0, 0.232, 0.178, 0.026, 0.622, 0.024, 0, 0.15, 0.098, 0, 0.026, 0.23, 0.006, 0.004, 0, 0.004, 0.3, 0.004, 0, 0, 0, 0, 0.006, 0, 0.022, 0.062, 0.122, 0.004, 0.618, 0.174, 0, 0.032, 0.562, 0.006, 0.2, 0.01, 0.002, 0.048, 0, 0, 0.04, 0.194, 0.026, 0, 0.08, 0.272, 0.076, 0.032, 0.306, 0.15, 0.004, 0.04, 0.062, 0.014, 0.008, 0, 0.006, 0.002, 0.07, 0, 0.008, 0.002, 0, 0, 0.038, 0.252, 0, 0.078, 0.016, 0.008, 0.426, 0.17, 0, 0, 0.142, 0, 0, 0.002, 0.006, 0.054, 0.144, 0.002, 0, 0.134, 0.01, 0.044, 0.088, 0.006, 0, 0.004, 0.076, 0.07, 0.068, 0.218, 0, 0.028, 0.102, 0, 0.246, 0.028, 0.022, 0.008, 0.206, 0.002, 0.088, 0.038, 0.54, 0, 0.012, 0.04, 0.07, 0.022, 0.036, 0, 0.488, 0, 0, 0.104, 0.032, 0.002, 0.008, 0, 0.008, 0.024, 0.23, 0, 0.15, 0.01, 0.064, 0.002, 0, 0.154, 0.004, 0.146, 0.052, 0.004, 0.17, 0.232, 0.016, 0, 0, 0.012, 0.158, 0.028, 0.054, 0.36, 0, 0.002, 0.106, 0.008, 0.106, 0.132, 0, 0, 0.236, 0.048, 0, 0.322, 0.036, 0, 0.274, 0.066, 0, 0.018, 0, 0.064, 0.006, 0.076, 0.03, 0.004, 0.038, 0.026, 0.218, 0, 0.002, 0.024, 0.008, 0.02, 0, 0.128, 0.23, 0.002, 0.002, 0.062, 0.014, 0.01, 0.02, 0.01, 0, 0.004, 0, 0.092, 0.052, 0, 0.096, 0, 0.038, 0.544, 0, 0.02, 0.018, 0.058, 0.062, 0.034, 0.002, 0.004, 0, 0, 0.222, 0.068, 0, 0.004, 0.588, 0.066, 0, 0.018, 0.038, 0.002, 0.01, 0, 0, 0.268, 0.006, 0, 0.034, 0.27, 0.074, 0, 0, 0.088, 0.002, 0.004, 0, 0.004, 0.106, 0.06, 0, 0, 0.01, 0, 0.01, 0, 0, 0.03, 0, 0.002, 0.016, 
0, 0.094, 0.014, 0.15, 0, 0.026, 0.002, 0.002, 0.418, 0.512, 0.056, 0.338, 0, 0.006, 0.114, 0.01, 0, 0.018, 0.056, 0, 0.022, 0, 0.054, 0.002, 0.586, 0, 0.202, 0.002, 0.022, 0.47, 0.026, 0.046, 0.32, 0.158, 0.006, 0, 0.176, 0.01, 0.01, 0, 0, 0.166, 0.034, 0.01, 0, 0.002, 0.096, 0.012, 0.416, 0, 0.034, 0.236, 0.032, 0.016, 0.03, 0.026, 0, 0, 0.048, 0.102, 0.038, 0.08, 0.202, 0.016, 0.638, 0.07, 0.134, 0, 0, 0, 0, 0.4, 0.02, 0.022, 0.002, 0.01, 0, 0.262, 0.024, 0.266, 0.004, 0, 0.022, 0.04, 0.028, 0, 0.176, 0.308, 0.172, 0.01, 0.008, 0.016, 0, 0.092, 0.04, 0.162, 0.112, 0.07, 0, 0.018, 0, 0.032, 0, 0.028, 0, 0.014, 0.01, 0.334, 0.006, 0.102, 0, 0.24, 0.074, 0, 0.486, 0.006, 0, 0.042, 0.004, 0.056, 0.028, 0, 0.152, 0.198, 0.094, 0.008, 0.028)
# Append the next block of candidate weights
rpData <- c(rpData, 0.046, 0.014, 0, 0, 0.156, 0.018, 0.072, 0.048, 0, 0, 0.008, 0.038, 0.572, 0.01, 0.03, 0.004, 0.104, 0.158, 0.01, 0.002, 0.004, 0, 0.106, 0.188, 0.114, 0, 0.012, 0.01, 0.02, 0, 0, 0.114, 0, 0.002, 0.016, 0, 0.18, 0, 0.022, 0, 0.394, 0.008, 0.016, 0.146, 0.066, 0.364, 0.028, 0.014, 0.526, 0.08, 0.006, 0.044, 0.048, 0, 0.018, 0.506, 0.006, 0.028, 0, 0.19, 0.008, 0, 0.022, 0, 0.024, 0.002, 0.122, 0, 0.092, 0.0769, 0, 0, 0.004, 0.094, 0.094, 0.304, 0.004, 0.006, 0.604, 0.12, 0.008, 0.006, 0.38, 0.03, 0, 0.112, 0, 0.156, 0.006, 0, 0.022, 0, 0.004, 0, 0.014, 0.18, 0, 0, 0, 0.006, 0.062, 0.024, 0.012, 0, 0.022, 0, 0, 0, 0, 0.012, 0.048, 0.144, 0.066, 0.002, 0.458, 0.088, 0.004, 0.374, 0, 0.198, 0.056, 0.006, 0.01, 0.18, 0.036, 0.002, 0.006, 0.026, 0.292, 0.008, 0.042, 0.122, 0.016, 0.006, 0.026, 0.066, 0.032, 0.002, 0.132, 0, 0.004, 0, 0.226, 0.11, 0.232, 0.002, 0.002, 0.002, 0, 0, 0.006, 0, 0.116, 0, 0.09, 0.186, 0.02, 0.01, 0, 0.112, 0.01, 0, 0.012, 0, 0.006, 0, 0.184, 0.062, 0.096, 0.034, 0.468, 0.018, 0, 0.006, 0.004, 0, 0.028, 0.046, 0.002, 0, 0.758, 0.174, 0.07, 0, 0.002, 0.02, 0.002, 0, 0, 0.03, 0.002, 0.092, 0.01, 0.034, 0.066, 0.004, 0, 0.216, 0.024, 0.014, 0.014, 0.376, 0.01, 0.06, 0, 0, 0.022, 0.004, 0.038, 0.312, 0.008, 0.104, 0.002, 0.01, 0, 0.114, 0.2, 0.152, 0.104, 0.016, 0.036, 0.132, 0.008, 0.004, 0.598, 0.048, 0.004, 0.022, 0, 0.232, 0.004, 0.014, 0.042, 0.13, 0.006, 0.002, 0, 0.1, 0.182, 0.038, 0, 0, 0.022, 0.08, 0.154, 0, 0, 0, 0.156, 0.256, 0.394, 0.008, 0, 0.004, 0.002, 0.192, 0.034, 0, 0.046, 0.294, 0.154, 0.038, 0.004, 0.046, 0.06, 0.006, 0.116, 0.044, 0.014, 0.022, 0, 0.47, 0.05, 0.006, 0, 0.008, 0.066, 0.004, 0.03, 0, 0.034, 0, 0.216, 0.21, 0.038, 0.078, 0.05, 0.602, 0.004, 0.014, 0.096, 0, 0.2, 0.158, 0.67, 0.148, 0, 0.01, 0, 0.134, 0.002, 0.45, 0.01, 0, 0.186, 0.01, 0.044, 0, 0.008, 0.002, 0.214, 0.004, 0.028, 0, 0.026, 0.138, 0.018, 0.112, 0.01, 0.174, 0.05, 0.03, 0.072, 0, 0.162, 0, 0.002, 0.054, 0, 0.024, 0.022, 0.002, 
0.294, 0.024, 0.02, 0.05, 0.008, 0.054, 0.06, 0, 0.036, 0.004, 0.016, 0.036, 0.012, 0, 0.026, 0.024, 0, 0, 0.03, 0.104, 0.006, 0.016, 0.006, 0, 0, 0.02, 0.04, 0.194, 0.016, 0.178, 0.394, 0.258, 0.03, 0.012, 0, 0.02, 0.006, 0.558, 0, 0.736, 0.006, 0.634, 0.412, 0.012, 0.016, 0.318, 0, 0.036, 0, 0, 0.002, 0, 0.066, 0, 0.006, 0.004, 0.142, 0.156, 0, 0.202, 0.428, 0, 0.708, 0, 0.002, 0, 0.034, 0.14, 0.002, 0.032, 0.002, 0, 0.004, 0.04, 0, 0.014, 0.09, 0, 0, 0.012, 0, 0.048, 0.002, 0.648, 0, 0.244, 0.012, 0.04, 0.012, 0.008, 0.014, 0.002, 0.012, 0.004, 0, 0, 0.238, 0, 0, 0.03, 0.08, 0.004, 0.222, 0, 0, 0.002, 0.014, 0, 0.01, 0.036, 0.054, 0.05, 0.018, 0.022, 0.14, 0, 0.058, 0.088, 0.114, 0.008, 0.012, 0, 0.558, 0, 0.066, 0, 0, 0, 0.062, 0.032)
# Append the next block of candidate weights
rpData <- c(rpData, 0.06, 0.464, 0, 0.096, 0, 0.02, 0.078, 0.024, 0.028, 0.128, 0.016, 0.044, 0.018, 0, 0.016, 0.16, 0, 0, 0, 0, 0.002, 0.004, 0.364, 0, 0, 0.304, 0, 0.004, 0.362, 0.01, 0.292, 0, 0.058, 0.028, 0.004, 0, 0, 0.088, 0.02, 0.01, 0, 0.014, 0.046, 0, 0.018, 0.134, 0, 0.074, 0, 0.002, 0.002, 0, 0, 0.004, 0.008, 0.104, 0.026, 0.016, 0, 0.046, 0.084, 0.096, 0.06, 0, 0.11, 0.222, 0, 0.012, 0.016, 0, 0.074, 0.068, 0, 0, 0, 0.07, 0, 0, 0.344, 0.35, 0, 0.166, 0.152, 0.002, 0.456, 0, 0, 0.034, 0.204, 0, 0, 0.038, 0, 0.026, 0, 0.006, 0.004, 0.002, 0.002, 0, 0, 0.012, 0, 0.054, 0.002, 0.132, 0.0769, 0.234, 0.004, 0.096, 0.008, 0.076, 0, 0.172, 0.012, 0, 0.038, 0.03, 0.176, 0, 0.022, 0, 0.108, 0.054, 0.114, 0, 0.264, 0, 0.074, 0, 0.05, 0.298, 0, 0, 0.346, 0, 0.056, 0, 0, 0.228, 0.002, 0, 0.006, 0, 0, 0, 0, 0.002, 0, 0.002, 0.004, 0.004, 0.002, 0.226, 0.002, 0.094, 0.528, 0, 0.004, 0.01, 0.026, 0.148, 0.074, 0, 0.214, 0.004, 0.026, 0.022, 0, 0.572, 0.08, 0.016, 0.004, 0.07, 0.188, 0.012, 0.026, 0, 0.024, 0, 0.426, 0, 0.488, 0.166, 0.134, 0.062, 0.086, 0.026, 0.002, 0.078, 0.166, 0, 0, 0.196, 0.004, 0, 0.056, 0.028, 0, 0.288, 0.096, 0.034, 0.05, 0.02, 0.254, 0.01, 0, 0.186, 0.138, 0, 0, 0.16, 0.006, 0, 0.296, 0.072, 0, 0.008, 0.256, 0, 0.134, 0.044, 0.146, 0.932, 0.064, 0, 0.004, 0.06, 0, 0.006, 0.094, 0.104, 0.006, 0.072, 0, 0.022, 0.058, 0.04, 0, 0.006, 0, 0.054, 0.08, 0.044, 0.014, 0.06, 0.004, 0, 0.088, 0.322, 0, 0.156, 0.01, 0, 0.554, 0.002, 0.156, 0.246, 0, 0.09, 0.086, 0, 0, 0.022, 0, 0.014, 0.25, 0.004, 0.296, 0.402, 0, 0.174, 0.006, 0.012, 0.18, 0.23, 0, 0.028, 0.106, 0, 0.12, 0.076, 0, 0, 0.05, 0.034, 0, 0.014, 0.024, 0.004, 0, 0.226, 0, 0.078, 0.024, 0.16, 0.042, 0.392, 0, 0.54, 0.204, 0.024, 0, 0.048, 0.01, 0, 0.112, 0.03, 0.006, 0.236, 0.094, 0.002, 0.004, 0.02, 0, 0.276, 0.482, 0.024, 0, 0, 0.026, 0, 0, 0.016, 0, 0.03, 0.012, 0.026, 0, 0.002, 0.036, 0.19, 0.008, 0, 0.546, 0.036, 0.002, 0, 0.002, 0.042, 0.104, 0.014, 0.298, 0, 0, 0, 0.004, 0.028, 0.006, 
0.196, 0.388, 0, 0, 0.02, 0.456, 0.002, 0.016, 0.13, 0.16, 0.042, 0.098, 0.024, 0.294, 0, 0.054, 0.636, 0.008, 0.048, 0, 0.004, 0.044, 0.022, 0.022, 0.04, 0, 0.092, 0.028, 0.012, 0.006, 0.048, 0.022, 0.228, 0, 0.006, 0.01, 0.012, 0.35, 0.3, 0, 0.016, 0.03, 0, 0, 0.014, 0.026, 0.228, 0.008, 0.014, 0, 0.002, 0.132, 0, 0.142, 0.018, 0, 0.004, 0, 0.006, 0.068, 0, 0.012, 0, 0.018, 0.008, 0.106, 0.024, 0.612, 0, 0, 0.176, 0.006, 0.086, 0.086, 0, 0.002, 0, 0.006, 0.14, 0.002, 0.004, 0.002, 0.002, 0.212, 0.002, 0, 0.024, 0.072, 0.478, 0.024, 0.04, 0.134, 0.026, 0.002, 0, 0, 0.002, 0.02, 0, 0, 0.004, 0.116, 0.008, 0.05, 0.19, 0.05, 0.06, 0, 0.05, 0.128, 0, 0, 0.012, 0, 0.226, 0, 0.054, 0.042, 0, 0, 0.018, 0.12, 0.122, 0.03, 0, 0.016, 0.094, 0.024, 0.02, 0, 0.004, 0.064)
# Append the next block of candidate weights
rpData <- c(rpData, 0.034, 0.146, 0.014, 0.178, 0.066, 0.082, 0.046, 0, 0.002, 0, 0.01, 0.032, 0.042, 0, 0, 0.366, 0.044, 0.102, 0.02, 0.034, 0.128, 0.062, 0.052, 0, 0.064, 0.004, 0, 0.018, 0.152, 0.128, 0.034, 0, 0.02, 0.04, 0.008, 0, 0.03, 0.54, 0.002, 0.002, 0.008, 0, 0.054, 0.232, 0, 0, 0.01, 0, 0.024, 0, 0.002, 0.036, 0.004, 0.12, 0.008, 0.016, 0.014, 0, 0, 0.276, 0.11, 0.004, 0.484, 0.008, 0.004, 0, 0.122, 0.096, 0.002, 0.028, 0.442, 0.334, 0.012, 0.012, 0.014, 0.036, 0.524, 0.662, 0.092, 0.072, 0, 0, 0.002, 0, 0.018, 0, 0.108, 0.026, 0.11, 0.042, 0.146, 0.018, 0.078, 0.142, 0.02, 0.022, 0, 0.018, 0.008, 0.072, 0, 0.012, 0, 0.068, 0.044, 0, 0.044, 0.04, 0.02, 0.004, 0, 0.16, 0.02, 0, 0.006, 0.37, 0.014, 0, 0, 0, 0.116, 0.004, 0.01, 0.002, 0.0769, 0.568, 0.006, 0, 0.072, 0.392, 0.188, 0.028, 0.03, 0, 0.03, 0.104, 0.006, 0.016, 0.088, 0.044, 0.006, 0, 0, 0.02, 0, 0.28, 0, 0.012, 0, 0.002, 0, 0.262, 0.06, 0, 0.148, 0.008, 0.122, 0, 0.02, 0, 0.052, 0, 0, 0.206, 0.002, 0.058, 0.274, 0, 0.014, 0, 0.002, 0.416, 0.044, 0.25, 0, 0, 0.442, 0.08, 0.148, 0.134, 0.06, 0, 0.286, 0, 0, 0.362, 0, 0.048, 0.008, 0.202, 0.092, 0.008, 0.004, 0.18, 0, 0.066, 0.234, 0, 0, 0, 0.052, 0, 0.072, 0, 0, 0.476, 0.002, 0.01, 0.122, 0.02, 0.234, 0.454, 0.08, 0, 0.15, 0.02, 0.05, 0.322, 0.078, 0.136, 0, 0.124, 0.126, 0.042, 0.012, 0, 0.084, 0.102, 0, 0.004, 0.01, 0.01, 0.04, 0.008, 0, 0.014, 0.026, 0.024, 0.534, 0.252, 0.012, 0.006, 0.044, 0, 0.422, 0.078, 0.046, 0.068, 0.236, 0.068, 0.068, 0.204, 0.018, 0, 0.09, 0, 0.004, 0.02, 0, 0, 0.246, 0, 0.01, 0.056, 0.088, 0, 0.052, 0.026, 0, 0, 0.102, 0.004, 0.116, 0.014, 0.01, 0.2, 0.102, 0, 0, 0, 0.216, 0, 0.01, 0.024, 0, 0.022, 0, 0, 0.002, 0.002, 0, 0, 0.22, 0.11, 0, 0, 0.316, 0, 0.042, 0.09, 0, 0.014, 0.182, 0, 0, 0.324, 0.002, 0, 0.108, 0.202, 0, 0.012, 0.092, 0.132, 0.02, 0.056, 0.014, 0.012, 0, 0.036, 0.028, 0.062, 0.264, 0.084, 0.194, 0, 0, 0.07, 0.004, 0.03, 0.124, 0.004, 0, 0.022, 0.006, 0.068, 0.03, 0.17, 0.008, 0.34, 0.056, 0.166, 0, 
0.166, 0.004, 0.036, 0.148, 0.116, 0.028, 0, 0.01, 0, 0.062, 0.122, 0, 0, 0.004, 0.32, 0.168, 0.008, 0, 0, 0.064, 0, 0.002, 0.046, 0.008, 0.018, 0, 0, 0, 0.028, 0.004, 0.02, 0.084, 0.002, 0, 0, 0, 0.154, 0.084, 0.15, 0.002, 0, 0, 0.024, 0.448, 0, 0.164, 0.062, 0.022, 0.156, 0, 0.318, 0.074, 0.048, 0.196, 0.066, 0.062, 0.026, 0, 0.01, 0.09, 0.048, 0.112, 0.024, 0.024, 0.056, 0, 0.08, 0, 0, 0.016, 0, 0.01, 0.04, 0, 0, 0.026, 0.066, 0.01, 0.002, 0.006, 0, 0.032, 0.478, 0.006, 0, 0, 0.018, 0.02, 0.654, 0, 0.08, 0, 0.042, 0.004, 0.018, 0, 0.142, 0.07, 0, 0.042, 0.058, 0.222, 0, 0.002, 0.012, 0.022, 0.002, 0.232, 0, 0, 0.016, 0.21, 0.026, 0, 0.022, 0.004, 0.11, 0.09, 0.024, 0, 0.02, 0.114, 0.002, 0.004, 0.352, 0.034, 0.032, 0.05, 0.186, 0.034, 0.13, 0.06)
# Append the next block of candidate weights
rpData <- c(rpData, 0.048, 0.234, 0.018, 0.002, 0.014, 0.006, 0, 0, 0.058, 0, 0, 0.01, 0.094, 0.024, 0.018, 0, 0.048, 0.104, 0.26, 0.776, 0, 0.116, 0.006, 0, 0.308, 0, 0.008, 0, 0.048, 0, 0.238, 0.032, 0.07, 0.002, 0.126, 0, 0.084, 0, 0.088, 0, 0.172, 0, 0.288, 0.074, 0, 0.032, 0, 0.402, 0.246, 0.166, 0.364, 0.15, 0.02, 0.146, 0.058, 0.192, 0, 0.058, 0, 0.268, 0.306, 0.136, 0.002, 0.01, 0.042, 0, 0.34, 0.006, 0, 0, 0, 0, 0.138, 0.028, 0.068, 0.048, 0.192, 0.018, 0.442, 0, 0, 0.466, 0.088, 0.004, 0.176, 0.012, 0.006, 0, 0.016, 0.06, 0.014, 0.246, 0, 0.024, 0.07, 0.002, 0, 0.234, 0.866, 0.014, 0.028, 0.004, 0.206, 0, 0.744, 0, 0, 0.004, 0, 0, 0.3, 0.194, 0.024, 0.058, 0.524, 0.002, 0, 0, 0, 0, 0.036, 0.138, 0, 0.046, 0.124, 0.04, 0.01, 0, 0.002, 0, 0.004, 0.46, 0.086, 0.104, 0.43, 0.094, 0.068, 0.08, 0.114, 0.112, 0.02, 0.212, 0.002, 0.064, 0.042, 0.046, 0.71, 0.028, 0.0769, 0, 0, 0, 0.012, 0.022, 0.004, 0.016, 0.038, 0, 0.01, 0.002, 0.546, 0.378, 0.036, 0.004, 0.07, 0.158, 0.09, 0, 0, 0.05, 0.004, 0, 0.14, 0.142, 0, 0.004, 0.004, 0.02, 0.05, 0.664, 0, 0.056, 0.046, 0.044, 0.004, 0.002, 0, 0.022, 0, 0, 0.016, 0, 0.008, 0.048, 0, 0.138, 0.06, 0.002, 0.02, 0.028, 0.21, 0.018, 0.084, 0.028, 0.034, 0, 0, 0.068, 0.174, 0.054, 0, 0.078, 0, 0.178, 0.03, 0.006, 0.06, 0.022, 0, 0, 0.068, 0, 0.002, 0, 0.004, 0, 0.01, 0.482, 0.002, 0.022, 0.052, 0.024, 0, 0.018, 0, 0.018, 0, 0.338, 0.122, 0.008, 0.008, 0.05, 0.002, 0, 0.012, 0.034, 0.164, 0.216, 0, 0.054, 0.028, 0.15, 0.608, 0.04, 0, 0.738, 0.002, 0.084, 0.034, 0.012, 0, 0.002, 0.01, 0, 0, 0, 0.184, 0.002, 0.01, 0.038, 0, 0.368, 0.01, 0.092, 0.23, 0.028, 0.002, 0.01, 0.042, 0.306, 0.01, 0, 0, 0.078, 0.022, 0.264, 0, 0.03, 0.018, 0.016, 0.042, 0.024, 0, 0.038, 0.16, 0.084, 0.012, 0.014, 0, 0.196, 0, 0.014, 0.004, 0.034, 0.08, 0.104, 0.002, 0, 0, 0.094, 0.056, 0.04, 0.208, 0.15, 0.122, 0.004, 0.014, 0.004, 0.014, 0.228, 0.038, 0.724, 0, 0, 0.486, 0, 0.26, 0, 0.01, 0.02, 0.008, 0.588, 0, 0.006, 0.026, 0, 0.372, 0.112, 0.014, 0, 
0.028, 0.002, 0.006, 0.006, 0.002, 0.006, 0.006, 0.006, 0, 0, 0.09, 0.078, 0.026, 0.012, 0.044, 0.26, 0.082, 0, 0.102, 0.16, 0, 0, 0, 0.02, 0.032, 0.002, 0.024, 0.076, 0.05, 0.04, 0.018, 0.004, 0.374, 0, 0.062, 0.678, 0, 0.042, 0, 0, 0.02, 0.064, 0.048, 0.066, 0, 0.266, 0, 0.014, 0.008, 0.536, 0, 0.084, 0.068, 0.106, 0.09, 0.13, 0.012, 0, 0.06, 0.026, 0.048, 0.004, 0, 0, 0.046, 0.158, 0.008, 0.026, 0.184, 0.256, 0.384, 0.046, 0.172, 0.004, 0.024, 0.016, 0.002, 0.08, 0, 0.126, 0.004, 0.53, 0.004, 0, 0.244, 0.004, 0.002, 0, 0, 0.236, 0, 0.038, 0, 0, 0.168, 0.094, 0, 0.002, 0.098, 0, 0.124, 0.05, 0, 0.02, 0, 0.046, 0.138, 0, 0.082, 0, 0.002, 0.034, 0.05, 0.084, 0.146, 0, 0, 0.676, 0, 0.008, 0, 0, 0, 0.004, 0.118, 0.062, 0.06, 0.002, 0, 0.008, 0.032, 0.016)
# Append the next block of candidate weights
rpData <- c(rpData, 0.724, 0, 0.1, 0, 0.008, 0.174, 0.008, 0.014, 0, 0.004, 0, 0.024, 0.17, 0, 0.002, 0.008, 0, 0.024, 0.002, 0, 0.006, 0.022, 0.008, 0.012, 0, 0.03, 0.006, 0.142, 0.186, 0, 0.002, 0.002, 0.016, 0.05, 0.012, 0.002, 0.23, 0.402, 0.046, 0, 0.018, 0.002, 0, 0.098, 0.004, 0, 0, 0.122, 0, 0.024, 0.06, 0.024, 0, 0, 0, 0.034, 0.112, 0.012, 0.048, 0.008, 0.008, 0.028, 0.064, 0.004, 0, 0.014, 0, 0.12, 0.11, 0.174, 0.014, 0, 0, 0, 0, 0.016, 0.24, 0.16, 0.04, 0.012, 0.132, 0, 0, 0.006, 0.268, 0, 0, 0.002, 0, 0.028, 0, 0.01, 0, 0, 0, 0, 0.018, 0.034, 0.17, 0.038, 0.03, 0.11, 0.088, 0.498, 0, 0, 0, 0.066, 0.1, 0.066, 0.09, 0, 0.012, 0.008, 0.408, 0.024, 0.802, 0.004, 0, 0.006, 0.006, 0.164, 0.03, 0.16, 0.078, 0, 0.004, 0.094, 0.022, 0.082, 0.002, 0.012, 0.008, 0.28, 0.008, 0.008, 0.02, 0.006, 0.37, 0.106, 0.012, 0.1, 0.004, 0.572, 0.016, 0, 0.002, 0.08, 0.01, 0.046, 0.002, 0.088, 0.026, 0.052, 0.198, 0.03, 0.03, 0.172, 0.006, 0, 0.154, 0.008, 0.236, 0.01, 0.072, 0.01, 0.088, 0, 0.51, 0.002, 0.124, 0.206, 0.04, 0.07, 0.258, 0.0769, 0, 0, 0.44, 0.076, 0.002, 0, 0.026, 0.034, 0, 0, 0.002, 0.004, 0.06, 0.502, 0.01, 0.152, 0, 0.174, 0, 0.026, 0.04, 0.556, 0.006, 0.076, 0.018, 0, 0.298, 0, 0.02, 0.002, 0.028, 0.086, 0, 0.004, 0, 0.026, 0, 0, 0.002, 0, 0, 0.002, 0.038, 0.23, 0.218, 0.166, 0.028, 0.224, 0.192, 0.032, 0, 0, 0.326, 0.008, 0.068, 0, 0.002, 0.1, 0.002, 0, 0.056, 0.056, 0.004, 0.146, 0.292, 0.002, 0.004, 0.096, 0.096, 0.4, 0, 0, 0, 0.184, 0.366, 0.17, 0.458, 0.446, 0, 0.014, 0.014, 0.096, 0.07, 0, 0.086, 0.092, 0.008, 0.042, 0, 0.006, 0.354, 0.058, 0.03, 0.746, 0.06, 0.158, 0.026, 0, 0.304, 0.012, 0.014, 0.09, 0.02, 0, 0.32, 0, 0.03, 0.05, 0.238, 0, 0.006, 0.012, 0, 0, 0, 0.116, 0, 0.008, 0, 0, 0.03, 0.232, 0.058, 0, 0.114, 0.024, 0.37, 0, 0, 0, 0.002, 0.378, 0.144, 0, 0.022, 0.068, 0, 0.672, 0.092, 0.054, 0.644, 0.196, 0.066, 0, 0.006, 0.032, 0.028, 0.006, 0.012, 0.002, 0.002, 0.002, 0, 0.002, 0, 0.028, 0.244, 0.692, 0, 0.016, 0.354, 0.032, 0, 0.316, 0.16, 
0.126, 0.03, 0.086, 0.064, 0.052, 0.176, 0, 0.09, 0.688, 0.004, 0.01, 0.004, 0, 0.038, 0, 0.054, 0.466, 0.182, 0.11, 0.01, 0.19, 0.024, 0.032, 0.094, 0, 0.028, 0.006, 0, 0, 0.046, 0.41, 0.278, 0.156, 0, 0.04, 0, 0.012, 0.014, 0.088, 0.028, 0.402, 0.098, 0, 0, 0.012, 0.04, 0.442, 0.02, 0.13, 0.002, 0.132, 0.116, 0.002, 0.04, 0.018, 0.16, 0.264, 0.048, 0.086, 0, 0.106, 0, 0.038, 0.002, 0.006, 0, 0.02, 0, 0.016, 0.08, 0.074, 0.17, 0.062, 0.012, 0.002, 0.014, 0, 0, 0.002, 0.004, 0.116, 0.016, 0, 0.494, 0.004, 0.158, 0, 0.006, 0, 0.004, 0.004, 0.054, 0.002, 0.02, 0.11, 0.054, 0, 0, 0.044, 0.28, 0.002, 0, 0.018, 0.072, 0.134, 0.008, 0.01, 0.012, 0.1, 0, 0, 0.06, 0.358, 0, 0, 0, 0.002, 0.15, 0.472, 0.208, 0.058, 0.002, 0.068, 0.01, 0.076, 0.002, 0.608, 0.084, 0)
# Append the next block of candidate weights
rpData <- c(rpData, 0.728, 0.084, 0.002, 0.106, 0.608, 0.01, 0.008, 0.04, 0.226, 0.108, 0.16, 0.03, 0, 0.018, 0.044, 0, 0.018, 0.002, 0.114, 0, 0.078, 0, 0.276, 0.024, 0, 0.028, 0.048, 0.07, 0.012, 0, 0, 0.244, 0.078, 0.03, 0.072, 0, 0.064, 0.206, 0.066, 0, 0.26, 0.096, 0.02, 0.086, 0.022, 0.414, 0.004, 0, 0.02, 0.018, 0.39, 0.018, 0.13, 0.008, 0.174, 0, 0.012, 0.124, 0.1, 0.144, 0.138, 0.004, 0.008, 0.014, 0.008, 0.566, 0.092, 0.014, 0.078, 0, 0.152, 0.006, 0.014, 0.086, 0, 0.08, 0.006, 0.008, 0, 0, 0.008, 0.034, 0.01, 0, 0.018, 0.148, 0.008, 0.33, 0.628, 0.478, 0, 0, 0.102, 0.626, 0.44, 0, 0, 0.058, 0.026, 0.056, 0, 0.012, 0.07, 0, 0.192, 0.014, 0, 0.042, 0.036, 0.074, 0, 0.042, 0.006, 0.16, 0.29, 0.264, 0.184, 0.012, 0.272, 0.302, 0.048, 0.026, 0.004, 0.014, 0, 0.05, 0.022, 0.02, 0.006, 0.006, 0, 0, 0.08, 0.014, 0.078, 0.096, 0, 0.226, 0.182, 0.004, 0.1, 0.204, 0, 0.026, 0, 0.026, 0.04, 0, 0, 0.8, 0.006, 0.056, 0.324, 0.218, 0.014, 0.028, 0, 0, 0.014, 0.006, 0.014, 0.144, 0.038, 0.426, 0.06, 0.028, 0.002, 0.338, 0, 0.22, 0.008, 0.092, 0, 0, 0.688, 0.006, 0.02, 0.002, 0.058, 0, 0.112, 0.09, 0.03, 0, 0.064, 0.028, 0.032, 0.052, 0.182, 0.068, 0.038, 0.006, 0.046, 0.008, 0, 0, 0.01, 0.006, 0, 0.014, 0.21, 0.07, 0.002, 0.146, 0.0769, 0.038, 0, 0, 0.052, 0.04, 0, 0.14, 0.082, 0.378, 0.004, 0.03, 0.01, 0, 0.052, 0.336, 0.054, 0.302, 0.018, 0, 0.092, 0, 0.014, 0.376, 0.004, 0.042, 0.226, 0, 0, 0.772, 0.2, 0.002, 0.136, 0.212, 0.132, 0.582, 0, 0.102, 0.422, 0.754, 0, 0.04, 0.022, 0.712, 0.014, 0.004, 0.062, 0.002, 0.048, 0.164, 0.05, 0.24, 0, 0.052, 0, 0.01, 0.26, 0.002, 0.006, 0.008, 0.01, 0.212, 0, 0.002, 0, 0.218, 0.012, 0.138, 0, 0.014, 0.254, 0.004, 0.27, 0.264, 0, 0.066, 0.094, 0.014, 0.086, 0.022, 0.018, 0.228, 0, 0.006, 0.018, 0.006, 0, 0.084, 0, 0.01, 0.038, 0, 0.006, 0.014, 0, 0.21, 0.04, 0.012, 0.114, 0.316, 0.734, 0, 0.114, 0.008, 0.002, 0.082, 0, 0, 0.024, 0.218, 0, 0, 0.37, 0.38, 0.036, 0.014, 0.15, 0.006, 0.022, 0, 0.066, 0.014, 0, 0.002, 0.038, 0, 0.02, 
0.022, 0.046, 0, 0.028, 0.002, 0, 0.608, 0, 0.744, 0.004, 0.068, 0, 0.31, 0.48, 0.008, 0.008, 0.014, 0.098, 0.02, 0, 0.18, 0, 0, 0.004, 0.014, 0.092, 0.316, 0, 0.024, 0.324, 0.56, 0, 0, 0, 0, 0, 0, 0.032, 0.088, 0.002, 0.014, 0.092, 0.06, 0.002, 0.042, 0.012, 0, 0.002, 0, 0.45, 0.026, 0.064, 0.004, 0, 0.016, 0.046, 0, 0.018, 0, 0.14, 0, 0, 0.03, 0.124, 0, 0.006, 0.006, 0.054, 0.224, 0.024, 0.082, 0.006, 0.064, 0.284, 0.322, 0.012, 0, 0.252, 0.022, 0.272, 0.028, 0.414, 0, 0.004, 0.002, 0, 0.002, 0, 0.404, 0.016, 0.068, 0.04, 0.006, 0.118, 0.02, 0.13, 0.04, 0.136, 0, 0.168, 0, 0, 0.07, 0.016, 0, 0.042, 0.148, 0.12, 0.006, 0, 0, 0, 0.016, 0.1, 0.016, 0.012, 0, 0.002, 0.05, 0.188, 0.05, 0.002, 0, 0.016, 0.196, 0, 0, 0.074, 0.134, 0.35, 0.084, 0.062, 0, 0.04)
rpData <- c(rpData, 0.09, 0.062, 0, 0.004, 0.008, 0.02, 0.14, 0.004, 0.23, 0, 0, 0.12, 0.014, 0, 0, 0, 0.02, 0, 0, 0, 0.074, 0.012, 0.708, 0.014, 0.072, 0.138, 0.084, 0.364, 0.052, 0.052, 0.224, 0.004, 0.06, 0.048, 0.02, 0.116, 0, 0.034, 0.11, 0, 0, 0.056, 0, 0.008, 0, 0.048, 0.062, 0.076, 0.07, 0.026, 0.032, 0.47, 0, 0.556, 0.002, 0.224, 0, 0.092, 0.164, 0.018, 0.01, 0.01, 0.032, 0.012, 0.142, 0, 0.068, 0.638, 0.006, 0.062, 0.1, 0.212, 0.204, 0.002, 0.004, 0.084, 0.016, 0.004, 0.054, 0.004, 0.168, 0, 0.054, 0.008, 0, 0.012, 0.078, 0.006, 0.086, 0, 0.116, 0.184, 0.02, 0.456, 0.026, 0.022, 0, 0.236, 0, 0.09, 0, 0.3, 0.014, 0.056, 0.026, 0.096, 0.102, 0.022, 0.002, 0, 0.144, 0.004, 0, 0.002, 0, 0.048, 0.054, 0.052, 0.042, 0.1, 0.012, 0.018, 0.048, 0.254, 0.096, 0, 0.002, 0.068, 0, 0.084, 0.008, 0.024, 0.03, 0.038, 0.048, 0.024, 0.092, 0, 0.012, 0.124, 0.088, 0.006, 0.226, 0.154, 0.244, 0.052, 0.028, 0.016, 0.124, 0, 0.352, 0.168, 0, 0.056, 0.006, 0.05, 0, 0.012, 0.008, 0, 0, 0.088, 0.018, 0, 0.062, 0.006, 0.012, 0, 0.026, 0, 0.044, 0, 0.1, 0.186, 0.234, 0.004, 0, 0.236, 0.006, 0, 0.038, 0.004, 0, 0.012, 0, 0.182, 0.076, 0, 0, 0.004, 0, 0.088, 0.04, 0.276, 0, 0.006, 0.034, 0.046, 0.078, 0.55, 0.004, 0, 0.004, 0, 0, 0.256, 0.132, 0.028, 0, 0.072, 0.268, 0.008, 0, 0, 0, 0, 0.042, 0.03, 0.216, 0.042, 0.004, 0.046, 0, 0.036, 0, 0.134, 0, 0.256, 0.05, 0, 0.486, 0, 0.008, 0.026, 0, 0.034, 0, 0.018, 0.0769, 0.008, 0.002, 0, 0.014, 0.004, 0, 0.058, 0.022, 0, 0, 0.16, 0.006, 0, 0.096, 0.004, 0.012, 0, 0.004, 0.014, 0, 0.004, 0.012, 0.122, 0.062, 0.006, 0.324, 0.294, 0, 0, 0.038, 0.002, 0.002, 0.016, 0, 0.006, 0.03, 0.002, 0, 0, 0.016, 0.432, 0.006, 0, 0.124, 0, 0.032, 0.024, 0.002, 0.056, 0, 0.216, 0.034, 0.198, 0.02, 0.002, 0.036, 0.02, 0.014, 0.004, 0.08, 0.032, 0, 0.014, 0, 0.002, 0, 0.05, 0.198, 0.224, 0.008, 0, 0.082, 0, 0.112, 0, 0, 0.088, 0.06, 0.046, 0.146, 0, 0.048, 0.002, 0, 0.012, 0, 0.034, 0.02, 0.562, 0.018, 0.086, 0, 0.098, 0, 0.016, 0.042, 0.148, 0.03, 0, 0.002, 
0, 0.06, 0.082, 0.028, 0, 0.016, 0.062, 0.056, 0.006, 0, 0, 0.002, 0, 0.008, 0.128, 0.248, 0.002, 0.008, 0.316, 0, 0.324, 0.096, 0.07, 0.106, 0.174, 0.014, 0.018, 0.156, 0, 0.002, 0.082, 0.006, 0.162, 0.034, 0, 0.038, 0.004, 0.194, 0.042, 0.002, 0.142, 0.01, 0.02, 0.048, 0.016, 0.24, 0.014, 0.038, 0.332, 0.02, 0.178, 0, 0.104, 0.02, 0.002, 0.192, 0.004, 0, 0, 0.262, 0, 0.026, 0, 0.002, 0, 0.002, 0.35, 0.182, 0.162, 0.002, 0, 0.016, 0.018, 0.028, 0.352, 0.024, 0, 0, 0.004, 0, 0, 0.032, 0.004, 0, 0.108, 0, 0.046, 0.366, 0.044, 0.314, 0, 0, 0.086, 0, 0.148, 0, 0, 0.014, 0, 0.102, 0, 0, 0.16, 0.024, 0.014, 0, 0.086, 0, 0.076, 0, 0.018, 0.004, 0, 0.218, 0.008, 0, 0.01, 0.21, 0.002, 0.13, 0.094, 0.022, 0.006, 0.002, 0.07, 0.022, 0, 0, 0.05, 0.81, 0.008)
rpData <- c(rpData, 0.06, 0, 0.032, 0.024, 0.03, 0.006, 0.062, 0.006, 0, 0.076, 0, 0.018, 0, 0.01, 0.046, 0.03, 0.094, 0.232, 0.058, 0.02, 0.238, 0.096, 0.008, 0, 0.004, 0, 0.002, 0.038, 0.036, 0.088, 0.06, 0.022, 0.048, 0.008, 0.008, 0.016, 0.102, 0, 0.006, 0.016, 0.094, 0.056, 0.044, 0, 0.026, 0.486, 0.254, 0, 0.006, 0.004, 0.004, 0.002, 0, 0, 0.346, 0.416, 0.004, 0.314, 0.21, 0.048, 0.006, 0.096, 0.228, 0.058, 0.014, 0.006, 0, 0, 0.02, 0.046, 0.006, 0, 0.01, 0.302, 0.11, 0.052, 0, 0.006, 0.322, 0.028, 0, 0.612, 0, 0.008, 0, 0, 0.012, 0.452, 0.002, 0.582, 0.168, 0.052, 0.238, 0.02, 0.052, 0.002, 0.006, 0.006, 0, 0.08, 0.018, 0.012, 0, 0.002, 0.088, 0.452, 0, 0.05, 0.402, 0.016, 0.726, 0.122, 0.232, 0.218, 0.026, 0.01, 0.01, 0.126, 0, 0.004, 0.018, 0.024, 0.052, 0, 0, 0.002, 0.428, 0.014, 0, 0.092, 0, 0.048, 0.038, 0, 0.064, 0.002, 0.2, 0.146, 0.002, 0.58, 0.624, 0.62, 0.032, 0.352, 0.01, 0.178, 0, 0.026, 0.044, 0.03, 0.008, 0.136, 0, 0.002, 0.002, 0.096, 0.146, 0.042, 0, 0.012, 0.288, 0.256, 0.06, 0.002, 0.018, 0, 0, 0, 0.072, 0.004, 0.044, 0.028, 0.08, 0.028, 0.046, 0.044, 0.016, 0.002, 0.092, 0.02, 0.002, 0.498, 0.054, 0.004, 0.83, 0, 0.06, 0.03, 0, 0.05, 0, 0.012, 0.046, 0.006, 0.058, 0, 0.066, 0.004, 0.158, 0.596, 0, 0, 0, 0.004, 0.008, 0, 0.008, 0.28, 0, 0.048, 0.006, 0.012, 0, 0.196, 0, 0.064, 0, 0.05, 0.026, 0.154, 0, 0.016, 0.026, 0.014, 0.8, 0.1, 0.03, 0, 0.004, 0.058, 0.012, 0.06, 0.14, 0, 0.276, 0.004, 0.006, 0, 0, 0, 0.052, 0.208, 0, 0.06, 0.198, 0, 0.08, 0.082, 0.016, 0.008, 0.002, 0.036, 0.036, 0, 0.016, 0.696, 0.064, 0, 0.07, 0, 0.244, 0.43, 0.042, 0.038, 0.02, 0, 0.004, 0.0769, 0.03, 0, 0.092, 0.018, 0.126, 0, 0.018, 0.164, 0, 0.024, 0.002, 0.022, 0, 0.002, 0.03, 0.058, 0.052, 0.138, 0.114, 0.108, 0.004, 0, 0.006, 0.554, 0.15, 0.002, 0, 0.244, 0.006, 0.408, 0, 0.01, 0.002, 0.01, 0, 0.362, 0.11, 0.056, 0, 0.024, 0, 0.48, 0, 0.01, 0, 0.276, 0, 0.094, 0.124, 0, 0, 0.03, 0.04, 0.036, 0.248, 0.18, 0.126, 0, 0.068, 0.078, 0, 0, 0.022, 0.252, 0.002, 
0.038, 0.3, 0.012, 0.14, 0.304, 0, 0, 0, 0.004, 0.024, 0.002, 0, 0.032, 0.002, 0, 0.044, 0.042, 0.092, 0.016, 0.122, 0.452, 0.108, 0.084, 0.03, 0.012, 0, 0, 0.036, 0, 0.236, 0.008, 0.022, 0.006, 0.004, 0.008, 0, 0.016, 0, 0.002, 0.016, 0, 0.042, 0.284, 0.034, 0, 0.134, 0.034, 0, 0.04, 0.136, 0.004, 0.004, 0.26, 0.184, 0.014, 0.018, 0.31, 0.004, 0.21, 0.07, 0.042, 0, 0.248, 0.676, 0.004, 0.008, 0, 0, 0.048, 0.004, 0.022, 0.03, 0.004, 0, 0.006, 0.002, 0.002, 0.178, 0.07, 0.044, 0.024, 0.154, 0.036, 0.168, 0, 0.074, 0.114, 0.054, 0.03, 0.252, 0.016, 0, 0, 0.904, 0.008, 0.014, 0.03, 0.442, 0.028, 0.006, 0, 0, 0.008, 0.022, 0.17, 0, 0.416, 0, 0, 0.038, 0, 0.132, 0, 0.728, 0.066, 0.014, 0.062, 0.018, 0, 0.006, 0.02, 0.382, 0.046, 0.168, 0.06, 0.37, 0.22, 0.016, 0, 0.044)
rpData <- c(rpData, 0, 0, 0.016, 0, 0.006, 0, 0.06, 0.002, 0.114, 0.042, 0.018, 0.006, 0.296, 0, 0, 0.158, 0, 0.018, 0, 0.132, 0, 0.124, 0.014, 0.272, 0.502, 0.408, 0.044, 0.054, 0.004, 0.004, 0.062, 0.014, 0.012, 0.482, 0.008, 0, 0.012, 0, 0.17, 0, 0.012, 0.514, 0, 0.176, 0, 0.014, 0.63, 0, 0, 0.244, 0.378, 0.004, 0.198, 0.008, 0.054, 0.27, 0.026, 0.006, 0, 0, 0.002, 0.216, 0.006, 0, 0.008, 0.188, 0, 0.032, 0.024, 0.094, 0, 0.002, 0.002, 0.004, 0.07, 0.022, 0.258, 0.146, 0, 0.258, 0, 0.004, 0.096, 0, 0, 0.038, 0, 0.012, 0.35, 0.22, 0, 0, 0.04, 0.004, 0.034, 0.004, 0.016, 0.116, 0.238, 0, 0, 0.004, 0.104, 0, 0.164, 0, 0.002, 0, 0.018, 0.002, 0.238, 0.178, 0.026, 0, 0.008, 0, 0.006, 0, 0.004, 0.014, 0, 0, 0.15, 0, 0.358, 0.132, 0, 0.144, 0, 0, 0.078, 0.006, 0.26, 0.136, 0.004, 0.002, 0, 0.022, 0.392, 0.084, 0.02, 0.03, 0.002, 0.128, 0.044, 0, 0.002, 0, 0.006, 0.092, 0.02, 0.136, 0.016, 0.106, 0.46, 0.148, 0.006, 0, 0, 0.002, 0, 0.032, 0, 0, 0, 0, 0.008, 0.424, 0.002, 0.116, 0, 0.092, 0.142, 0.03, 0, 0.028, 0, 0, 0.03, 0.038, 0.57, 0, 0.014, 0.002, 0.024, 0.146, 0, 0.066, 0, 0.026, 0.002, 0.012, 0.072, 0.016, 0.014, 0.102, 0.026, 0.05, 0.132, 0.086, 0.092, 0.006, 0, 0.064, 0.062, 0.004, 0.008, 0.168, 0.006, 0.154, 0.248, 0, 0.18, 0.118, 0.002, 0.11, 0.034, 0.018, 0.096, 0.046, 0, 0.066, 0.02, 0.632, 0, 0.174, 0.012, 0.178, 0.156, 0.002, 0.048, 0.004, 0.046, 0.098, 0.016, 0.008, 0.03, 0, 0.11, 0.304, 0.118, 0.074, 0, 0.526, 0.008, 0.036, 0.026, 0.052, 0.024, 0.012, 0.006, 0.07, 0.218, 0.016, 0.066, 0, 0.002, 0.046, 0.09, 0.058, 0.002, 0.004, 0.102, 0.1, 0, 0.294, 0, 0.04, 0.036, 0, 0.002, 0.152, 0, 0.012, 0.334, 0.024, 0.074, 0.18, 0.024, 0.006, 0.05, 0, 0.004, 0.068, 0, 0.146, 0.002, 0.09, 0.082, 0.008, 0.03, 0, 0.03, 0.002, 0.004, 0, 0, 0.036, 0.01, 0, 0.098, 0.192, 0, 0.0769, 0, 0, 0, 0.032, 0.056, 0, 0.038, 0.026, 0, 0.592, 0.414, 0.214, 0.094, 0, 0.214, 0.018, 0.028, 0.032, 0.068, 0, 0, 0.088, 0.004, 0.01, 0.056, 0.046, 0.002, 0, 0.002, 0.002, 0.158, 0, 0, 0.026, 
0.052, 0.212, 0.038, 0, 0, 0.676, 0, 0.004, 0, 0, 0.06, 0.006, 0.002, 0.002, 0, 0.002, 0, 0, 0.006, 0.28, 0.002, 0.046, 0, 0.004, 0.43, 0, 0.154, 0, 0.002, 0.016, 0.018, 0.008, 0.082, 0.182, 0.138, 0, 0.888, 0.012, 0.206, 0.048, 0, 0.152, 0.008, 0, 0.004, 0.002, 0.008, 0.194, 0.134, 0.048, 0.026, 0, 0, 0.028, 0.01, 0.214, 0.356, 0, 0.02, 0.008, 0.064, 0.648, 0.002, 0, 0.006, 0.006, 0, 0.036, 0.018, 0.002, 0.068, 0.962, 0, 0.018, 0.108, 0, 0, 0.03, 0.452, 0.044, 0.346, 0.024, 0, 0.016, 0, 0, 0, 0.098, 0.022, 0.038, 0.104, 0.112, 0.028, 0, 0.06, 0, 0.026, 0.01, 0.042, 0.46, 0.006, 0.088, 0.094, 0, 0.004, 0.012, 0.068, 0.036, 0.178, 0.002, 0.172, 0.054, 0.06, 0.002, 0, 0.016, 0, 0.032, 0.108, 0.322, 0, 0.004, 0.004, 0, 0.026, 0.05, 0.048, 0.008, 0.034)
rpData <- c(rpData, 0.028, 0.008, 0.114, 0.08, 0.004, 0.026, 0.004, 0.468, 0.042, 0.008, 0.01, 0.022, 0.004, 0, 0.004, 0.01, 0.406, 0.026, 0.106, 0.014, 0, 0.252, 0, 0.052, 0, 0, 0.014, 0, 0.04, 0.004, 0.362, 0.016, 0.004, 0.146, 0.128, 0.014, 0.156, 0.032, 0.12, 0.01, 0.028, 0.002, 0, 0, 0.152, 0.076, 0.144, 0.324, 0.404, 0.044, 0.008, 0, 0.36, 0.186, 0.002, 0.002, 0.002, 0.05, 0.272, 0.09, 0.028, 0, 0.022, 0.236, 0, 0.168, 0, 0.162, 0.052, 0.006, 0, 0.436, 0, 0, 0.006, 0, 0.14, 0.006, 0, 0.396, 0.022, 0.174, 0, 0.044, 0, 0.016, 0.06, 0.066, 0.428, 0.002, 0.286, 0, 0.002, 0.068, 0, 0.052, 0.12, 0.006, 0, 0.108, 0, 0.398, 0.454, 0.014, 0, 0.002, 0, 0, 0.004, 0.004, 0.266, 0, 0.212, 0, 0, 0.19, 0.164, 0.02, 0.004, 0, 0, 0, 0, 0.03, 0.006, 0, 0.162, 0.002, 0.058, 0.002, 0.044, 0.062, 0.41, 0.114, 0.574, 0.018, 0.048, 0.01, 0.006, 0.038, 0.004, 0.004, 0.01, 0, 0.388, 0.044, 0.002, 0.006, 0.004, 0.006, 0.046, 0.378, 0, 0, 0.278, 0, 0, 0.002, 0.012, 0.002, 0, 0.004, 0.022, 0.02, 0, 0.01, 0.502, 0.004, 0, 0.014, 0.002, 0.112, 0.032, 0.026, 0.06, 0.334, 0, 0.024, 0.2, 0.188, 0.19, 0.114, 0.048, 0.08, 0.068, 0.128, 0.026, 0.006, 0.024, 0.026, 0, 0.02, 0.116, 0.592, 0, 0.572, 0.004, 0.042, 0.186, 0, 0.004, 0, 0.004, 0.008, 0.006, 0.036, 0, 0, 0.004, 0, 0, 0.038, 0.072, 0, 0.026, 0.082, 0.052, 0.176, 0.29, 0, 0.05, 0.08, 0, 0.452, 0, 0.11, 0.018, 0, 0.14, 0, 0.002, 0, 0.008, 0.002, 0, 0.008, 0.122, 0.13, 0.288, 0.01, 0.004, 0, 0.356, 0, 0, 0, 0.008, 0, 0.008, 0.026, 0, 0.002, 0.078, 0, 0.292, 0, 0, 0.028, 0.004, 0.002, 0.25, 0.004, 0.052, 0.002, 0.006, 0.016, 0.562, 0.006, 0.09, 0.542, 0.012, 0, 0.002, 0.256, 0, 0.004, 0, 0.522, 0, 0.586, 0, 0.034, 0.098, 0, 0, 0, 0.668, 0.016, 0, 0.004, 0.004, 0.002, 0, 0.174, 0.022, 0, 0.08, 0.024, 0.004, 0, 0.246, 0.032, 0.002, 0, 0.552, 0.004, 0.106, 0.008, 0.102, 0.14, 0.598, 0.026, 0, 0.208, 0, 0.19, 0.002, 0.012, 0.078, 0, 0.02, 0, 0, 0.082, 0.296, 0, 0.102, 0.118, 0, 0.346, 0.058, 0.32, 0.078, 0.024, 0.074, 0.0769, 0.004, 0.026, 
0.362, 0.552, 0.008, 0.488, 0.136, 0.102, 0.004, 0.134, 0.086, 0, 0, 0, 0.048, 0.198, 0.024, 0.006, 0, 0.294, 0.592, 0.022, 0.08, 0.096, 0.006, 0, 0.026, 0.012, 0, 0.008, 0.064, 0, 0.358, 0.008, 0.008, 0.206, 0.038, 0.354, 0, 0.152, 0.032, 0.038, 0.146, 0.06, 0, 0.268, 0.032, 0, 0, 0, 0.212, 0.264, 0.018, 0, 0.016, 0.206, 0.168, 0.116, 0.094, 0, 0.008, 0.822, 0.044, 0.002, 0.02, 0.644, 0.03, 0.094, 0.004, 0.01, 0, 0, 0.06, 0, 0.092, 0.004, 0.224, 0.024, 0.36, 0.396, 0, 0.02, 0.444, 0, 0.064, 0.028, 0.002, 0.132, 0.036, 0.006, 0.132, 0, 0.002, 0, 0.094, 0.004, 0.004, 0.002, 0, 0.048, 0, 0.002, 0, 0.036, 0.052, 0.002, 0.016, 0.094, 0.024, 0.292, 0, 0.042, 0.064, 0.01, 0.042, 0.038, 0.002, 0.398, 0.178, 0.02, 0.122, 0.004, 0.014, 0.05, 0.122)
rpData <- c(rpData, 0.046, 0.086, 0, 0.014, 0.398, 0.504, 0, 0, 0, 0, 0.006, 0, 0, 0.178, 0.014, 0, 0.028, 0.02, 0.142, 0.3, 0.062, 0.036, 0.006, 0.07, 0.578, 0.006, 0.228, 0.002, 0, 0.024, 0.06, 0.038, 0.072, 0, 0.002, 0.4, 0, 0, 0, 0.35, 0.244, 0.47, 0.014, 0.128, 0.018, 0.026, 0.012, 0, 0, 0.098, 0.016, 0.73, 0.036, 0, 0.004, 0.07, 0.038, 0.034, 0.01, 0.032, 0, 0.196, 0.044, 0.024, 0.02, 0.004, 0.002, 0.062, 0, 0.026, 0.528, 0.052, 0.07, 0.342, 0, 0.138, 0.006, 0.31, 0.14, 0.738, 0.008, 0.15, 0.052, 0, 0, 0.012, 0, 0.18, 0.012, 0.062, 0.006, 0.164, 0.004, 0.048, 0.002, 0.004, 0.042, 0.028, 0.058, 0.046, 0.23, 0.038, 0.032, 0.01, 0.018, 0.166, 0.154, 0.076, 0, 0.002, 0, 0, 0.806, 0.042, 0, 0.048, 0, 0.178, 0.522, 0, 0, 0, 0.052, 0, 0.108, 0.04, 0.01, 0.044, 0.008, 0.046, 0.096, 0.024, 0.026, 0, 0, 0.014, 0, 0.004, 0.072, 0, 0.074, 0.114, 0.124, 0.008, 0.014, 0.002, 0.04, 0.016, 0.002, 0, 0, 0, 0.018, 0.414, 0.112, 0, 0, 0, 0.032, 0.332, 0.01, 0.282, 0.452, 0.342, 0, 0.004, 0.002, 0.02, 0.004, 0, 0.004, 0.662, 0, 0.002, 0.026, 0.018, 0.054, 0, 0, 0, 0.096, 0, 0, 0, 0, 0.096, 0, 0, 0.044, 0.088, 0, 0.346, 0.198, 0, 0.004, 0.006, 0.088, 0.25, 0, 0.036, 0, 0.002, 0, 0.14, 0, 0.17, 0, 0.162, 0.002, 0.014, 0, 0, 0.092, 0, 0, 0, 0, 0.262, 0.09, 0.058, 0.094, 0.002, 0, 0.006, 0.032, 0.008, 0, 0.004, 0.088, 0.336, 0, 0, 0.218, 0, 0, 0.308, 0.292, 0.158, 0.084, 0.092, 0.654, 0, 0.002, 0.002, 0.104, 0.022, 0.188, 0.284, 0.368, 0.032, 0, 0.602, 0.234, 0.614, 0.054, 0.158, 0.006, 0, 0.018, 0.006, 0.724, 0.03, 0, 0, 0.078, 0.258, 0.064, 0, 0.006, 0, 0.026, 0.708, 0.012, 0.024, 0, 0.074, 0, 0.088, 0.072, 0.468, 0.01, 0.302, 0.03, 0, 0.034, 0, 0, 0.056, 0.002, 0.208, 0.012, 0.028, 0.002, 0.312, 0, 0.148, 0, 0.102, 0, 0.102, 0.036, 0.008, 0.402, 0.046, 0.008, 0.066, 0, 0, 0, 0, 0.152, 0, 0.094, 0.102, 0, 0.018, 0.05, 0, 0, 0.05, 0.014, 0.004, 0.006, 0, 0.002, 0.072, 0.036, 0.062, 0.284, 0.052, 0.002, 0.006, 0, 0.668, 0.132, 0, 0, 0.45, 0.012, 0.076, 0, 0, 0, 0, 0.07, 0, 0.008, 
0.016, 0.03, 0.034, 0.46, 0.014, 0, 0.06, 0.668, 0.036, 0.008, 0.002, 0.006, 0, 0.012, 0.034, 0.182, 0.132, 0.004, 0.118, 0, 0.016, 0.43, 0.002, 0.232, 0, 0.012, 0.0769, 0, 0.9, 0, 0.004, 0.084, 0.004, 0.038, 0.444, 0, 0, 0.054, 0.016, 0.046, 0, 0.002, 0.022, 0.276, 0.192, 0.576, 0, 0.006, 0, 0.022, 0.01, 0.02, 0.004, 0.028, 0.17, 0.068, 0, 0.01, 0.326, 0.026, 0.04, 0, 0.024, 0.594, 0.03, 0, 0.054, 0.188, 0.002, 0, 0, 0, 0.07, 0.064, 0.124, 0.016, 0, 0.02, 0, 0.024, 0.07, 0.012, 0, 0.276, 0.002, 0, 0, 0.03, 0, 0.008, 0.092, 0.002, 0.074, 0.038, 0.166, 0.034, 0, 0, 0, 0, 0, 0.132, 0.032, 0.03, 0.014, 0, 0, 0.068, 0.09, 0.002, 0.01, 0.016, 0, 0.016, 0.022, 0, 0.026, 0, 0.686, 0.066, 0, 0.11, 0, 0.02, 0.002, 0.016, 0, 0, 0.008, 0.59, 0.006, 0.002, 0, 0.02, 0, 0.112)
rpData <- c(rpData, 0.176, 0.076, 0.048, 0, 0.01, 0.002, 0.108, 0.052, 0, 0.306, 0.392, 0.052, 0.01, 0.004, 0, 0.07, 0.004, 0.002, 0.314, 0.002, 0.006, 0.022, 0.18, 0, 0.016, 0, 0, 0.286, 0.012, 0.052, 0.002, 0.068, 0.05, 0, 0.64, 0.214, 0, 0.018, 0, 0, 0.13, 0, 0.056, 0.006, 0.232, 0.002, 0.004, 0.02, 0, 0.032, 0.06, 0.018, 0.002, 0.022, 0.15, 0.016, 0.322, 0.01, 0.018, 0, 0, 0.012, 0.05, 0.018, 0, 0.012, 0, 0.098, 0.102, 0, 0.034, 0.04, 0, 0.006, 0.002, 0.164, 0, 0.002, 0, 0.004, 0.002, 0, 0, 0.046, 0.314, 0, 0, 0.192, 0.036, 0.454, 0.02, 0.03, 0.074, 0.038, 0.086, 0.004, 0.01, 0.158, 0.002, 0.01, 0, 0.004, 0.048, 0.036, 0, 0, 0, 0.006, 0.006, 0.004, 0.158, 0, 0.006, 0.086, 0.024, 0.004, 0.082, 0, 0.072, 0, 0, 0.638, 0.13, 0.056, 0.3, 0.04, 0, 0, 0, 0.09, 0, 0.002, 0.014, 0.068, 0, 0.378, 0, 0.514, 0.008, 0.01, 0.272, 0.002, 0.002, 0.078, 0.524, 0.018, 0.004, 0.19, 0.08, 0, 0.292, 0.044, 0, 0, 0.172, 0.034, 0.288, 0.36, 0.106, 0.064, 0.664, 0.058, 0.06, 0.1, 0.036, 0.004, 0, 0.024, 0.136, 0, 0.218, 0, 0.692, 0.022, 0, 0, 0.244, 0.012, 0, 0, 0.002, 0.554, 0, 0.012, 0.008, 0, 0.16, 0.054, 0, 0.186, 0.006, 0.386, 0, 0, 0.04, 0, 0.11, 0, 0, 0.038, 0, 0, 0, 0, 0.008, 0.002, 0.086, 0.046, 0.026, 0.008, 0.188, 0.004, 0, 0.006, 0.004, 0.006, 0, 0.112, 0, 0.182, 0, 0.038, 0.004, 0.228, 0.428, 0.014, 0.176, 0.326, 0.02, 0.07, 0.002, 0.348, 0.038, 0, 0.066, 0.01, 0, 0, 0.414, 0.004, 0.366, 0.008, 0.082, 0.654, 0.008, 0.058, 0.81, 0.032, 0.074, 0, 0, 0, 0, 0.004, 0.554, 0.016, 0, 0, 0.352, 0.286, 0.024, 0.052, 0.01, 0, 0, 0.07, 0.07, 0, 0.032, 0.002, 0.006, 0.004, 0, 0.004, 0, 0.046, 0, 0.028, 0.09, 0.014, 0.142, 0.04, 0.026, 0.004, 0.084, 0.002, 0.53, 0, 0.054, 0.024, 0.066, 0.006, 0.046, 0, 0.128, 0, 0.03, 0.564, 0.038, 0.246, 0, 0, 0, 0.062, 0.006, 0.014, 0, 0.016, 0.01, 0, 0.882, 0, 0.592, 0.118, 0, 0.314, 0.2, 0.096, 0.33, 0.15, 0, 0, 0.026, 0.012, 0.028, 0, 0.348, 0.012, 0.092, 0.002, 0.008, 0.032, 0.008, 0.338, 0, 0, 0.166, 0, 0, 0, 0.05, 0.094, 0.03, 0.008, 0.002, 
0.004, 0.006, 0.144, 0.11, 0.074, 0.176, 0.028, 0.022, 0.03, 0.138, 0, 0.062, 0.002, 0.054, 0.026, 0, 0.142, 0.07, 0.122, 0, 0.01, 0.02, 0.082, 0, 0.078, 0.164, 0.106, 0.228, 0.584, 0.414, 0.002, 0.114, 0.008, 0.014, 0.24, 0.058, 0.198, 0.004, 0, 0, 0, 0.186, 0, 0.054)
# Assemble the random-portfolio weights into a 499 x 13 matrix, labelling
# the columns with the asset names in a single matrix() call
rp <- matrix(rpData, ncol = 13, dimnames = list(NULL, nameData))
str(rp)
## num [1:499, 1:13] 0.0769 0 0 0 0 0.01 0.012 0.326 0 0.006 ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr [1:13] "Convertible Arbitrage" "CTA Global" "Distressed Securities" "Emerging Markets" ...
# Create a portfolio specification object using asset_names, then layer on
# the constraints and objectives via the pipe. Every add.constraint()/
# add.objective() helper takes the portfolio specification as its first
# argument and returns the updated specification, so the chain is equivalent
# to repeated reassignment of port_spec.
port_spec <- portfolio.spec(assets = asset_names) %>%
  add.constraint(type = "weight_sum", min_sum = 0.99, max_sum = 1.01) %>%
  add.constraint(type = "long_only") %>%
  add.objective(type = "return", name = "mean") %>%
  add.objective(type = "risk", name = "StdDev") %>%
  add.objective(type = "risk_budget", name = "StdDev", min_prisk = 0.05, max_prisk = 0.1)
# Run a single period optimization using random portfolios as the optimization method
opt <- optimize.portfolio(R = asset_returns, portfolio = port_spec, optimize_method = "random", rp = rp, trace = TRUE)
# Print the output of the single-period optimization
print(opt)
## ***********************************
## PortfolioAnalytics Optimization
## ***********************************
##
## Call:
## optimize.portfolio(R = asset_returns, portfolio = port_spec,
## optimize_method = "random", trace = TRUE, rp = rp)
##
## Optimal Weights:
## Convertible Arbitrage CTA Global Distressed Securities
## 0.0769 0.0769 0.0769
## Emerging Markets Equity Market Neutral Event Driven
## 0.0769 0.0769 0.0769
## Fixed Income Arbitrage Global Macro Long/Short Equity
## 0.0769 0.0769 0.0769
## Merger Arbitrage Relative Value Short Selling
## 0.0769 0.0769 0.0769
## Funds of Funds
## 0.0769
##
## Objective Measures:
## mean
## 0.00661
##
##
## StdDev
## 0.01127
##
## contribution :
## Convertible Arbitrage CTA Global Distressed Securities
## 0.0011748 0.0005275 0.0011596
## Emerging Markets Equity Market Neutral Event Driven
## 0.0023359 0.0004921 0.0011980
## Fixed Income Arbitrage Global Macro Long/Short Equity
## 0.0008087 0.0010365 0.0013267
## Merger Arbitrage Relative Value Short Selling
## 0.0005854 0.0008765 -0.0014767
## Funds of Funds
## 0.0012263
##
## pct_contrib_StdDev :
## Convertible Arbitrage CTA Global Distressed Securities
## 0.10423 0.04680 0.10288
## Emerging Markets Equity Market Neutral Event Driven
## 0.20724 0.04366 0.10629
## Fixed Income Arbitrage Global Macro Long/Short Equity
## 0.07175 0.09196 0.11771
## Merger Arbitrage Relative Value Short Selling
## 0.05194 0.07776 -0.13102
## Funds of Funds
## 0.10880
# Running the optimization with periodic rebalancing and analyzing the out-of-sample results of the backtest is an important step to better understand and potentially refine the constraints and objectives
# optimize.portfolio.rebalancing() supports optimization with periodic rebalancing (backtesting) to examine out of sample performance
# In addition to the arguments for optimize.portfolio(), a periodic rebalancing frequency must be specified with rebalance_on, training_period to specify the number of periods to use as the training data for the initial optimization, and rolling_window to specify the number of periods for the window width of the optimization
# If rolling_window is set to NULL, each optimization will use all data available up to the period at which the optimization is run.
# To reduce computation time for this exercise, the set of random portfolios, rp, is generated using 50 permutations, and search_size, how many portfolios to test, is set to 1000
# If you are actually optimizing portfolios yourself, you'll probably want to test more portfolios (the default value for search_size is 20,000)!
newRPData <- c(0.07692, 0.018, 0, 0.044, 0, 0.308, 0.362, 0, 0.014, 0, 0.268, 0, 0.156, 0, 0.004, 0.01, 0, 0.182, 0.004, 0.076, 0.142, 0.456, 0.018, 0.02, 0, 0.034, 0.12, 0.004, 0.046, 0.01, 0.002, 0.006, 0, 0.002, 0.34, 0.022, 0.052, 0.002, 0.088, 0, 0, 0, 0.436, 0, 0.002, 0, 0.016, 0.002, 0.166, 0.07692, 0.17, 0, 0, 0.002, 0.018, 0, 0.298, 0, 0.088, 0.066, 0.006, 0.238, 0, 0.004, 0.008, 0.12, 0.038, 0.08, 0.004, 0.03, 0.004, 0.342, 0.278, 0.118, 0.306, 0, 0.436, 0.042, 0.058, 0, 0.086, 0.02, 0.008, 0.022, 0.006, 0.192, 0.004, 0.308, 0.004, 0, 0.066, 0.066, 0, 0.004, 0, 0.004, 0, 0.068, 0.07692, 0.186, 0, 0, 0.176, 0.002, 0.006, 0.19, 0.038, 0, 0.016, 0, 0.074, 0, 0.024, 0.69, 0, 0.238, 0.136, 0.062, 0, 0.026, 0, 0.018, 0, 0.06, 0.084, 0.002, 0.036, 0.05, 0.018, 0.344, 0.046, 0.076, 0.38, 0.002, 0.114, 0.396, 0, 0, 0, 0, 0.166, 0, 0, 0, 0.026, 0.046, 0.11, 0.07692, 0.212, 0.074, 0.062, 0.096, 0.058, 0, 0.058, 0.012, 0.528, 0, 0.244, 0.044, 0.592, 0.022, 0, 0.002, 0.194, 0.002, 0, 0.076, 0, 0.016, 0, 0.404, 0.068, 0.206, 0.016, 0.1, 0.012, 0, 0.006, 0.006, 0.026, 0.024, 0.02, 0.284, 0, 0, 0.13, 0, 0.016, 0, 0.002, 0.002, 0, 0, 0.01, 0.002, 0.07692, 0.03, 0.572, 0.176, 0.024, 0.006, 0.004, 0, 0.138, 0, 0.01, 0.186, 0.136, 0, 0.158, 0.014, 0.004, 0.134, 0, 0.014, 0.02, 0.166, 0.072, 0, 0.014, 0.004, 0.088, 0.028, 0.02, 0.144, 0, 0.09, 0.67, 0.102, 0.06, 0.014, 0.242, 0.012, 0.174, 0.234, 0, 0.458, 0.058, 0.432, 0, 0.102, 0.028, 0.004, 0, 0.07692, 0.194, 0, 0.014, 0, 0.018, 0.118, 0.04, 0, 0, 0.046, 0.072, 0, 0, 0.264, 0.104, 0.756, 0, 0.026, 0.006, 0.394, 0.026, 0, 0.056, 0.212, 0.13, 0.052, 0.094, 0.328, 0.134, 0.114, 0.21, 0.05, 0.006, 0, 0.012, 0, 0.266, 0.014, 0, 0.402, 0.024, 0.166, 0.016, 0.804, 0.024, 0.1, 0.002, 0.058, 0.07692, 0.04, 0.162, 0, 0.008, 0.016, 0.028, 0, 0.01, 0.334, 0.16, 0.006, 0.082, 0.02, 0.038, 0, 0.006, 0.018, 0.264, 0, 0.224, 0.028, 0.548, 0.034, 0.034, 0.012, 0.082, 0.29, 0.216, 0.144, 0.312, 0.012, 0, 0.294, 0.148, 0.014, 0, 0.038, 0.03, 
0, 0, 0.004, 0.026, 0.022, 0, 0, 0.532, 0.078, 0.064, 0.07692, 0.022, 0.026, 0.002, 0, 0.232, 0.002, 0.33, 0.152, 0.024, 0, 0.112, 0.138, 0, 0.06, 0.046, 0.006, 0.04, 0.016, 0.236, 0.042, 0, 0, 0.584, 0.012, 0.008, 0.052, 0, 0.008, 0.018, 0.114, 0.042, 0.08, 0.002, 0.002, 0.098, 0, 0.082, 0.22, 0.156, 0.004, 0, 0.004, 0, 0.058, 0.846, 0.062, 0.574, 0.332, 0.07692, 0.032, 0.134, 0.024, 0.064, 0.054, 0, 0.04, 0.516, 0, 0.374, 0.044, 0.02, 0, 0.172, 0, 0.002, 0, 0.016, 0.012, 0.022, 0.004, 0, 0, 0.04, 0.078, 0.042, 0.014, 0.006, 0.034, 0.002, 0, 0.022, 0.294, 0.006, 0.248, 0, 0.002, 0.004, 0.034, 0.004, 0.016, 0.044, 0.354, 0.016, 0.01, 0.018, 0, 0.084, 0.07692, 0.016, 0, 0.68, 0.068, 0.1, 0.376, 0.006, 0.018, 0.012, 0.002, 0.002, 0.106, 0.306, 0.012, 0.016, 0, 0.01, 0.012, 0.004, 0, 0.072, 0, 0, 0.014, 0.006, 0.106, 0.006, 0.04, 0.222, 0, 0.034, 0, 0.122, 0, 0.038, 0.004, 0.194, 0.04, 0.408, 0.594, 0.166, 0, 0.016, 0, 0, 0.042, 0.016, 0.002, 0.07692, 0.01, 0, 0, 0, 0.018, 0.03, 0.016, 0.01, 0, 0.008, 0.022, 0, 0, 0.194, 0.068, 0.032, 0.024, 0.406, 0.548, 0.04, 0.004, 0, 0, 0.118, 0.114, 0, 0.036, 0.132, 0.006, 0.004, 0.16, 0.012, 0, 0.02, 0.38, 0.112, 0, 0.086, 0.024, 0, 0.148, 0.026, 0.038, 0, 0.024, 0.014, 0.06, 0.1, 0.07692, 0.038, 0.026, 0, 0, 0.178, 0.01, 0.002, 0.084, 0, 0.03, 0.282, 0.006, 0.09, 0.008, 0.028, 0.004, 0, 0.01, 0.008, 0.008, 0.21, 0.008, 0, 0.032, 0, 0.174, 0.036, 0.008, 0.016, 0.416, 0.008, 0.09, 0.064, 0.002, 0.116, 0, 0, 0, 0, 0, 0.002, 0.002, 0.002, 0, 0, 0.152, 0.038, 0, 0.07692, 0.034, 0.01, 0.004, 0.566, 0, 0.062, 0.016, 0, 0.006, 0.012, 0.024, 0, 0, 0.036, 0.006, 0.07, 0.12, 0.038, 0.022, 0.01, 0, 0, 0.004, 0.004, 0.184, 0, 0.046, 0.024, 0.154, 0.024, 0, 0.014, 0, 0, 0.038, 0, 0, 0.026, 0, 0, 0.098, 0, 0.11, 0.124, 0.004, 0, 0.172, 0.022)
# Assemble the 50-permutation random-portfolio matrix (one column per asset)
rp <- matrix(newRPData, ncol = 13)
# Normalize each row so the weights of every random portfolio sum to 1.
# rowSums(rp) has one entry per row and recycles down the columns, so this
# divides each element by its row total -- a vectorized, idiomatic
# replacement for t(apply(rp, 1, function(x) { x / sum(x) })) that avoids
# the per-row closure calls and the extra transpose.
rp <- rp / rowSums(rp)
dimnames(rp) <- list(NULL, nameData)
str(rp)
## num [1:49, 1:13] 0.0769 0.018 0 0.0437 0 ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr [1:13] "Convertible Arbitrage" "CTA Global" "Distressed Securities" "Emerging Markets" ...
# Run the optimization backtest with quarterly rebalancing
opt_rebal <- optimize.portfolio.rebalancing(R = asset_returns, portfolio = port_spec, optimize_method = "random", rp = rp, trace = TRUE, search_size = 1000, rebalance_on = "quarters", training_period = 60, rolling_window = 60)
# Print the output of the optimization backtest
print(opt_rebal)
## **************************************************
## PortfolioAnalytics Optimization with Rebalancing
## **************************************************
##
## Call:
## optimize.portfolio.rebalancing(R = asset_returns, portfolio = port_spec,
## optimize_method = "random", search_size = 1000, trace = TRUE,
## rp = rp, rebalance_on = "quarters", training_period = 60,
## rolling_window = 60)
##
## Number of rebalancing dates: 32
## First rebalance date:
## [1] "2001-12-31"
## Last rebalance date:
## [1] "2009-08-31"
##
## Annualized Portfolio Rebalancing Return:
## [1] 0.0632
##
## Annualized Portfolio Standard Deviation:
## [1] 0.0424
# Extract the objective measures for the single period optimization
extractObjectiveMeasures(opt)
## $mean
## mean
## 0.00661
##
## $StdDev
## $StdDev$StdDev
## [1] 0.0113
##
## $StdDev$contribution
## Convertible Arbitrage CTA Global Distressed Securities
## 0.001175 0.000527 0.001160
## Emerging Markets Equity Market Neutral Event Driven
## 0.002336 0.000492 0.001198
## Fixed Income Arbitrage Global Macro Long/Short Equity
## 0.000809 0.001036 0.001327
## Merger Arbitrage Relative Value Short Selling
## 0.000585 0.000876 -0.001477
## Funds of Funds
## 0.001226
##
## $StdDev$pct_contrib_StdDev
## Convertible Arbitrage CTA Global Distressed Securities
## 0.1042 0.0468 0.1029
## Emerging Markets Equity Market Neutral Event Driven
## 0.2072 0.0437 0.1063
## Fixed Income Arbitrage Global Macro Long/Short Equity
## 0.0718 0.0920 0.1177
## Merger Arbitrage Relative Value Short Selling
## 0.0519 0.0778 -0.1310
## Funds of Funds
## 0.1088
# Extract the objective measures for the optimization backtest
extractObjectiveMeasures(opt_rebal)
## mean StdDev StdDev.contribution.Convertible Arbitrage
## 2001-12-31 0.00892 0.00994 0.000531
## 2002-03-31 0.00834 0.00960 0.000554
## 2002-06-30 0.00824 0.00961 0.000558
## 2002-09-30 0.00745 0.00913 0.000784
## 2002-12-31 0.00735 0.00926 0.000770
## 2003-03-31 0.00717 0.00904 0.000606
## 2003-06-30 0.00750 0.00988 0.000855
## 2003-09-30 0.00773 0.00775 0.000492
## 2003-12-31 0.00841 0.00696 0.000315
## 2004-03-31 0.00846 0.00704 0.000315
## 2004-06-30 0.00751 0.00715 0.000357
## 2004-09-30 0.00719 0.00702 0.000370
## 2004-12-31 0.00725 0.00703 0.000356
## 2005-03-31 0.00642 0.00638 0.000336
## 2005-06-30 0.00634 0.00668 0.000485
## 2005-09-30 0.00649 0.00679 0.000489
## 2005-12-31 0.00643 0.00701 0.000497
## 2006-03-31 0.00636 0.00701 0.000496
## 2006-06-30 0.00631 0.00725 0.000493
## 2006-09-30 0.00612 0.00727 0.000478
## 2006-12-31 0.00659 0.00722 0.000501
## 2007-03-31 0.00671 0.00718 0.000497
## 2007-06-30 0.00693 0.00728 0.000493
## 2007-09-30 0.00711 0.00761 0.000509
## 2007-12-31 0.00731 0.00792 0.000582
## 2008-03-31 0.00653 0.00869 0.000604
## 2008-06-30 0.00637 0.00842 0.000609
## 2008-09-30 0.00498 0.01127 0.001128
## 2008-12-31 0.00352 0.01308 0.001573
## 2009-03-31 0.00304 0.01288 0.001599
## 2009-06-30 0.00414 0.01359 0.001756
## 2009-08-31 0.00461 0.01373 0.001831
## StdDev.contribution.CTA Global
## 2001-12-31 3.49e-04
## 2002-03-31 2.76e-04
## 2002-06-30 1.95e-04
## 2002-09-30 2.40e-03
## 2002-12-31 2.80e-03
## 2003-03-31 9.28e-05
## 2003-06-30 3.56e-03
## 2003-09-30 5.83e-04
## 2003-12-31 7.91e-04
## 2004-03-31 8.21e-04
## 2004-06-30 1.03e-03
## 2004-09-30 1.03e-03
## 2004-12-31 1.08e-03
## 2005-03-31 1.24e-03
## 2005-06-30 1.27e-03
## 2005-09-30 1.25e-03
## 2005-12-31 1.23e-03
## 2006-03-31 1.19e-03
## 2006-06-30 1.21e-03
## 2006-09-30 1.23e-03
## 2006-12-31 1.21e-03
## 2007-03-31 1.19e-03
## 2007-06-30 1.22e-03
## 2007-09-30 1.51e-03
## 2007-12-31 1.46e-03
## 2008-03-31 1.29e-03
## 2008-06-30 1.16e-03
## 2008-09-30 1.01e-03
## 2008-12-31 7.05e-04
## 2009-03-31 6.10e-04
## 2009-06-30 5.36e-04
## 2009-08-31 5.04e-04
## StdDev.contribution.Distressed Securities
## 2001-12-31 0.001088
## 2002-03-31 0.001098
## 2002-06-30 0.001101
## 2002-09-30 0.000806
## 2002-12-31 0.000768
## 2003-03-31 0.001098
## 2003-06-30 0.000760
## 2003-09-30 0.000751
## 2003-12-31 0.000701
## 2004-03-31 0.000724
## 2004-06-30 0.000632
## 2004-09-30 0.000610
## 2004-12-31 0.000638
## 2005-03-31 0.000561
## 2005-06-30 0.000556
## 2005-09-30 0.000549
## 2005-12-31 0.000572
## 2006-03-31 0.000597
## 2006-06-30 0.000596
## 2006-09-30 0.000615
## 2006-12-31 0.000615
## 2007-03-31 0.000603
## 2007-06-30 0.000602
## 2007-09-30 0.000569
## 2007-12-31 0.000596
## 2008-03-31 0.000671
## 2008-06-30 0.000646
## 2008-09-30 0.000935
## 2008-12-31 0.001279
## 2009-03-31 0.001255
## 2009-06-30 0.001382
## 2009-08-31 0.001420
## StdDev.contribution.Emerging Markets
## 2001-12-31 0.002784
## 2002-03-31 0.002594
## 2002-06-30 0.002607
## 2002-09-30 0.001170
## 2002-12-31 0.001063
## 2003-03-31 0.002354
## 2003-06-30 0.000988
## 2003-09-30 0.001600
## 2003-12-31 0.001659
## 2004-03-31 0.001663
## 2004-06-30 0.001624
## 2004-09-30 0.001529
## 2004-12-31 0.001477
## 2005-03-31 0.001120
## 2005-06-30 0.001017
## 2005-09-30 0.001038
## 2005-12-31 0.001116
## 2006-03-31 0.001126
## 2006-06-30 0.001247
## 2006-09-30 0.001237
## 2006-12-31 0.001224
## 2007-03-31 0.001216
## 2007-06-30 0.001239
## 2007-09-30 0.001222
## 2007-12-31 0.001319
## 2008-03-31 0.001585
## 2008-06-30 0.001553
## 2008-09-30 0.001989
## 2008-12-31 0.002444
## 2009-03-31 0.002397
## 2009-06-30 0.002563
## 2009-08-31 0.002587
## StdDev.contribution.Equity Market Neutral
## 2001-12-31 0.000358
## 2002-03-31 0.000355
## 2002-06-30 0.000353
## 2002-09-30 0.000543
## 2002-12-31 0.000531
## 2003-03-31 0.000316
## 2003-06-30 0.000489
## 2003-09-30 0.000248
## 2003-12-31 0.000253
## 2004-03-31 0.000250
## 2004-06-30 0.000295
## 2004-09-30 0.000293
## 2004-12-31 0.000292
## 2005-03-31 0.000217
## 2005-06-30 0.000252
## 2005-09-30 0.000239
## 2005-12-31 0.000245
## 2006-03-31 0.000241
## 2006-06-30 0.000252
## 2006-09-30 0.000251
## 2006-12-31 0.000260
## 2007-03-31 0.000257
## 2007-06-30 0.000266
## 2007-09-30 0.000298
## 2007-12-31 0.000321
## 2008-03-31 0.000369
## 2008-06-30 0.000366
## 2008-09-30 0.000537
## 2008-12-31 0.000623
## 2009-03-31 0.000624
## 2009-06-30 0.000613
## 2009-08-31 0.000611
## StdDev.contribution.Event Driven
## 2001-12-31 0.001146
## 2002-03-31 0.001158
## 2002-06-30 0.001170
## 2002-09-30 0.000000
## 2002-12-31 0.000000
## 2003-03-31 0.001136
## 2003-06-30 0.000000
## 2003-09-30 0.000773
## 2003-12-31 0.000803
## 2004-03-31 0.000817
## 2004-06-30 0.000733
## 2004-09-30 0.000711
## 2004-12-31 0.000720
## 2005-03-31 0.000660
## 2005-06-30 0.000673
## 2005-09-30 0.000665
## 2005-12-31 0.000715
## 2006-03-31 0.000742
## 2006-06-30 0.000752
## 2006-09-30 0.000757
## 2006-12-31 0.000732
## 2007-03-31 0.000730
## 2007-06-30 0.000747
## 2007-09-30 0.000671
## 2007-12-31 0.000722
## 2008-03-31 0.000838
## 2008-06-30 0.000820
## 2008-09-30 0.001106
## 2008-12-31 0.001332
## 2009-03-31 0.001320
## 2009-06-30 0.001406
## 2009-08-31 0.001432
## StdDev.contribution.Fixed Income Arbitrage
## 2001-12-31 0.000656
## 2002-03-31 0.000626
## 2002-06-30 0.000625
## 2002-09-30 0.000523
## 2002-12-31 0.000535
## 2003-03-31 0.000638
## 2003-06-30 0.000550
## 2003-09-30 0.000665
## 2003-12-31 0.000271
## 2004-03-31 0.000278
## 2004-06-30 0.000256
## 2004-09-30 0.000251
## 2004-12-31 0.000245
## 2005-03-31 0.000255
## 2005-06-30 0.000255
## 2005-09-30 0.000250
## 2005-12-31 0.000235
## 2006-03-31 0.000221
## 2006-06-30 0.000232
## 2006-09-30 0.000230
## 2006-12-31 0.000208
## 2007-03-31 0.000210
## 2007-06-30 0.000205
## 2007-09-30 0.000255
## 2007-12-31 0.000268
## 2008-03-31 0.000349
## 2008-06-30 0.000326
## 2008-09-30 0.000575
## 2008-12-31 0.000975
## 2009-03-31 0.000968
## 2009-06-30 0.001062
## 2009-08-31 0.001105
## StdDev.contribution.Global Macro
## 2001-12-31 0.001350
## 2002-03-31 0.001267
## 2002-06-30 0.001270
## 2002-09-30 0.002151
## 2002-12-31 0.002095
## 2003-03-31 0.001103
## 2003-06-30 0.002030
## 2003-09-30 0.001047
## 2003-12-31 0.001024
## 2004-03-31 0.001013
## 2004-06-30 0.001048
## 2004-09-30 0.001046
## 2004-12-31 0.001027
## 2005-03-31 0.000858
## 2005-06-30 0.000807
## 2005-09-30 0.000815
## 2005-12-31 0.000826
## 2006-03-31 0.000773
## 2006-06-30 0.000831
## 2006-09-30 0.000848
## 2006-12-31 0.000835
## 2007-03-31 0.000824
## 2007-06-30 0.000836
## 2007-09-30 0.000866
## 2007-12-31 0.000885
## 2008-03-31 0.000917
## 2008-06-30 0.000851
## 2008-09-30 0.000956
## 2008-12-31 0.000912
## 2009-03-31 0.000882
## 2009-06-30 0.000906
## 2009-08-31 0.000906
## StdDev.contribution.Long/Short Equity
## 2001-12-31 0.001172
## 2002-03-31 0.001191
## 2002-06-30 0.001198
## 2002-09-30 0.000283
## 2002-12-31 0.000244
## 2003-03-31 0.001087
## 2003-06-30 0.000229
## 2003-09-30 0.000909
## 2003-12-31 0.001083
## 2004-03-31 0.001077
## 2004-06-30 0.001062
## 2004-09-30 0.001055
## 2004-12-31 0.001032
## 2005-03-31 0.000715
## 2005-06-30 0.000717
## 2005-09-30 0.000723
## 2005-12-31 0.000784
## 2006-03-31 0.000842
## 2006-06-30 0.000933
## 2006-09-30 0.000938
## 2006-12-31 0.000920
## 2007-03-31 0.000911
## 2007-06-30 0.000938
## 2007-09-30 0.000868
## 2007-12-31 0.000944
## 2008-03-31 0.001136
## 2008-06-30 0.001111
## 2008-09-30 0.001392
## 2008-12-31 0.001545
## 2009-03-31 0.001531
## 2009-06-30 0.001619
## 2009-08-31 0.001635
## StdDev.contribution.Merger Arbitrage
## 2001-12-31 0.000596
## 2002-03-31 0.000619
## 2002-06-30 0.000633
## 2002-09-30 0.000638
## 2002-12-31 0.000609
## 2003-03-31 0.000631
## 2003-06-30 0.000571
## 2003-09-30 0.000379
## 2003-12-31 0.000450
## 2004-03-31 0.000443
## 2004-06-30 0.000429
## 2004-09-30 0.000419
## 2004-12-31 0.000409
## 2005-03-31 0.000378
## 2005-06-30 0.000429
## 2005-09-30 0.000413
## 2005-12-31 0.000453
## 2006-03-31 0.000496
## 2006-06-30 0.000489
## 2006-09-30 0.000481
## 2006-12-31 0.000452
## 2007-03-31 0.000460
## 2007-06-30 0.000479
## 2007-09-30 0.000415
## 2007-12-31 0.000470
## 2008-03-31 0.000517
## 2008-06-30 0.000518
## 2008-09-30 0.000596
## 2008-12-31 0.000653
## 2009-03-31 0.000655
## 2009-06-30 0.000631
## 2009-08-31 0.000624
## StdDev.contribution.Relative Value StdDev.contribution.Short Selling
## 2001-12-31 0.000586 -0.001993
## 2002-03-31 0.000584 -0.002021
## 2002-06-30 0.000597 -0.001987
## 2002-09-30 0.000000 -0.000177
## 2002-12-31 0.000000 -0.000150
## 2003-03-31 0.000619 -0.001835
## 2003-06-30 0.000000 -0.000153
## 2003-09-30 0.000499 -0.001137
## 2003-12-31 0.000440 -0.001805
## 2004-03-31 0.000444 -0.001778
## 2004-06-30 0.000449 -0.001681
## 2004-09-30 0.000450 -0.001656
## 2004-12-31 0.000451 -0.001567
## 2005-03-31 0.000428 -0.000954
## 2005-06-30 0.000475 -0.000812
## 2005-09-30 0.000473 -0.000655
## 2005-12-31 0.000486 -0.000753
## 2006-03-31 0.000479 -0.000817
## 2006-06-30 0.000506 -0.000971
## 2006-09-30 0.000511 -0.000983
## 2006-12-31 0.000487 -0.000915
## 2007-03-31 0.000485 -0.000897
## 2007-06-30 0.000501 -0.000954
## 2007-09-30 0.000469 -0.000784
## 2007-12-31 0.000517 -0.000948
## 2008-03-31 0.000601 -0.001116
## 2008-06-30 0.000592 -0.001053
## 2008-09-30 0.000849 -0.001005
## 2008-12-31 0.001113 -0.001446
## 2009-03-31 0.001109 -0.001425
## 2009-06-30 0.001191 -0.001447
## 2009-08-31 0.001214 -0.001513
## StdDev.contribution.Funds of Funds
## 2001-12-31 0.001316
## 2002-03-31 0.001297
## 2002-06-30 0.001290
## 2002-09-30 0.000000
## 2002-12-31 0.000000
## 2003-03-31 0.001194
## 2003-06-30 0.000000
## 2003-09-30 0.000939
## 2003-12-31 0.000976
## 2004-03-31 0.000977
## 2004-06-30 0.000927
## 2004-09-30 0.000911
## 2004-12-31 0.000873
## 2005-03-31 0.000567
## 2005-06-30 0.000558
## 2005-09-30 0.000549
## 2005-12-31 0.000603
## 2006-03-31 0.000629
## 2006-06-30 0.000680
## 2006-09-30 0.000683
## 2006-12-31 0.000688
## 2007-03-31 0.000689
## 2007-06-30 0.000710
## 2007-09-30 0.000734
## 2007-12-31 0.000787
## 2008-03-31 0.000923
## 2008-06-30 0.000924
## 2008-09-30 0.001201
## 2008-12-31 0.001369
## 2009-03-31 0.001356
## 2009-06-30 0.001370
## 2009-08-31 0.001376
## StdDev.pct_contrib_StdDev.Convertible Arbitrage
## 2001-12-31 0.0534
## 2002-03-31 0.0577
## 2002-06-30 0.0580
## 2002-09-30 0.0859
## 2002-12-31 0.0832
## 2003-03-31 0.0670
## 2003-06-30 0.0865
## 2003-09-30 0.0635
## 2003-12-31 0.0452
## 2004-03-31 0.0448
## 2004-06-30 0.0499
## 2004-09-30 0.0527
## 2004-12-31 0.0507
## 2005-03-31 0.0526
## 2005-06-30 0.0726
## 2005-09-30 0.0719
## 2005-12-31 0.0709
## 2006-03-31 0.0707
## 2006-06-30 0.0680
## 2006-09-30 0.0657
## 2006-12-31 0.0694
## 2007-03-31 0.0693
## 2007-06-30 0.0677
## 2007-09-30 0.0669
## 2007-12-31 0.0735
## 2008-03-31 0.0695
## 2008-06-30 0.0723
## 2008-09-30 0.1001
## 2008-12-31 0.1203
## 2009-03-31 0.1242
## 2009-06-30 0.1292
## 2009-08-31 0.1334
## StdDev.pct_contrib_StdDev.CTA Global
## 2001-12-31 0.0351
## 2002-03-31 0.0288
## 2002-06-30 0.0203
## 2002-09-30 0.2635
## 2002-12-31 0.3020
## 2003-03-31 0.0103
## 2003-06-30 0.3605
## 2003-09-30 0.0752
## 2003-12-31 0.1136
## 2004-03-31 0.1166
## 2004-06-30 0.1434
## 2004-09-30 0.1469
## 2004-12-31 0.1535
## 2005-03-31 0.1947
## 2005-06-30 0.1900
## 2005-09-30 0.1836
## 2005-12-31 0.1757
## 2006-03-31 0.1692
## 2006-06-30 0.1675
## 2006-09-30 0.1687
## 2006-12-31 0.1678
## 2007-03-31 0.1658
## 2007-06-30 0.1674
## 2007-09-30 0.1990
## 2007-12-31 0.1838
## 2008-03-31 0.1486
## 2008-06-30 0.1379
## 2008-09-30 0.0894
## 2008-12-31 0.0539
## 2009-03-31 0.0474
## 2009-06-30 0.0395
## 2009-08-31 0.0367
## StdDev.pct_contrib_StdDev.Distressed Securities
## 2001-12-31 0.1095
## 2002-03-31 0.1144
## 2002-06-30 0.1146
## 2002-09-30 0.0883
## 2002-12-31 0.0829
## 2003-03-31 0.1215
## 2003-06-30 0.0769
## 2003-09-30 0.0969
## 2003-12-31 0.1007
## 2004-03-31 0.1027
## 2004-06-30 0.0883
## 2004-09-30 0.0869
## 2004-12-31 0.0907
## 2005-03-31 0.0879
## 2005-06-30 0.0832
## 2005-09-30 0.0809
## 2005-12-31 0.0817
## 2006-03-31 0.0851
## 2006-06-30 0.0821
## 2006-09-30 0.0845
## 2006-12-31 0.0853
## 2007-03-31 0.0840
## 2007-06-30 0.0827
## 2007-09-30 0.0749
## 2007-12-31 0.0752
## 2008-03-31 0.0773
## 2008-06-30 0.0767
## 2008-09-30 0.0830
## 2008-12-31 0.0978
## 2009-03-31 0.0974
## 2009-06-30 0.1017
## 2009-08-31 0.1034
## StdDev.pct_contrib_StdDev.Emerging Markets
## 2001-12-31 0.280
## 2002-03-31 0.270
## 2002-06-30 0.271
## 2002-09-30 0.128
## 2002-12-31 0.115
## 2003-03-31 0.260
## 2003-06-30 0.100
## 2003-09-30 0.206
## 2003-12-31 0.238
## 2004-03-31 0.236
## 2004-06-30 0.227
## 2004-09-30 0.218
## 2004-12-31 0.210
## 2005-03-31 0.175
## 2005-06-30 0.152
## 2005-09-30 0.153
## 2005-12-31 0.159
## 2006-03-31 0.161
## 2006-06-30 0.172
## 2006-09-30 0.170
## 2006-12-31 0.170
## 2007-03-31 0.169
## 2007-06-30 0.170
## 2007-09-30 0.161
## 2007-12-31 0.166
## 2008-03-31 0.182
## 2008-06-30 0.184
## 2008-09-30 0.177
## 2008-12-31 0.187
## 2009-03-31 0.186
## 2009-06-30 0.189
## 2009-08-31 0.188
## StdDev.pct_contrib_StdDev.Equity Market Neutral
## 2001-12-31 0.0360
## 2002-03-31 0.0370
## 2002-06-30 0.0367
## 2002-09-30 0.0595
## 2002-12-31 0.0574
## 2003-03-31 0.0350
## 2003-06-30 0.0495
## 2003-09-30 0.0321
## 2003-12-31 0.0364
## 2004-03-31 0.0355
## 2004-06-30 0.0412
## 2004-09-30 0.0417
## 2004-12-31 0.0416
## 2005-03-31 0.0341
## 2005-06-30 0.0378
## 2005-09-30 0.0351
## 2005-12-31 0.0350
## 2006-03-31 0.0343
## 2006-06-30 0.0347
## 2006-09-30 0.0344
## 2006-12-31 0.0361
## 2007-03-31 0.0358
## 2007-06-30 0.0366
## 2007-09-30 0.0392
## 2007-12-31 0.0405
## 2008-03-31 0.0425
## 2008-06-30 0.0434
## 2008-09-30 0.0477
## 2008-12-31 0.0477
## 2009-03-31 0.0484
## 2009-06-30 0.0451
## 2009-08-31 0.0445
## StdDev.pct_contrib_StdDev.Event Driven
## 2001-12-31 0.1154
## 2002-03-31 0.1207
## 2002-06-30 0.1218
## 2002-09-30 0.0000
## 2002-12-31 0.0000
## 2003-03-31 0.1257
## 2003-06-30 0.0000
## 2003-09-30 0.0998
## 2003-12-31 0.1154
## 2004-03-31 0.1160
## 2004-06-30 0.1024
## 2004-09-30 0.1013
## 2004-12-31 0.1024
## 2005-03-31 0.1034
## 2005-06-30 0.1007
## 2005-09-30 0.0979
## 2005-12-31 0.1020
## 2006-03-31 0.1059
## 2006-06-30 0.1036
## 2006-09-30 0.1041
## 2006-12-31 0.1014
## 2007-03-31 0.1018
## 2007-06-30 0.1026
## 2007-09-30 0.0882
## 2007-12-31 0.0912
## 2008-03-31 0.0965
## 2008-06-30 0.0974
## 2008-09-30 0.0981
## 2008-12-31 0.1019
## 2009-03-31 0.1024
## 2009-06-30 0.1035
## 2009-08-31 0.1043
## StdDev.pct_contrib_StdDev.Fixed Income Arbitrage
## 2001-12-31 0.0660
## 2002-03-31 0.0652
## 2002-06-30 0.0650
## 2002-09-30 0.0574
## 2002-12-31 0.0578
## 2003-03-31 0.0706
## 2003-06-30 0.0557
## 2003-09-30 0.0858
## 2003-12-31 0.0390
## 2004-03-31 0.0395
## 2004-06-30 0.0358
## 2004-09-30 0.0357
## 2004-12-31 0.0348
## 2005-03-31 0.0399
## 2005-06-30 0.0382
## 2005-09-30 0.0367
## 2005-12-31 0.0335
## 2006-03-31 0.0315
## 2006-06-30 0.0319
## 2006-09-30 0.0317
## 2006-12-31 0.0288
## 2007-03-31 0.0293
## 2007-06-30 0.0281
## 2007-09-30 0.0335
## 2007-12-31 0.0338
## 2008-03-31 0.0402
## 2008-06-30 0.0387
## 2008-09-30 0.0511
## 2008-12-31 0.0746
## 2009-03-31 0.0751
## 2009-06-30 0.0781
## 2009-08-31 0.0805
## StdDev.pct_contrib_StdDev.Global Macro
## 2001-12-31 0.1359
## 2002-03-31 0.1321
## 2002-06-30 0.1322
## 2002-09-30 0.2357
## 2002-12-31 0.2262
## 2003-03-31 0.1220
## 2003-06-30 0.2054
## 2003-09-30 0.1351
## 2003-12-31 0.1471
## 2004-03-31 0.1439
## 2004-06-30 0.1464
## 2004-09-30 0.1490
## 2004-12-31 0.1460
## 2005-03-31 0.1344
## 2005-06-30 0.1207
## 2005-09-30 0.1199
## 2005-12-31 0.1178
## 2006-03-31 0.1102
## 2006-06-30 0.1146
## 2006-09-30 0.1166
## 2006-12-31 0.1157
## 2007-03-31 0.1148
## 2007-06-30 0.1148
## 2007-09-30 0.1138
## 2007-12-31 0.1117
## 2008-03-31 0.1056
## 2008-06-30 0.1010
## 2008-09-30 0.0849
## 2008-12-31 0.0697
## 2009-03-31 0.0685
## 2009-06-30 0.0667
## 2009-08-31 0.0659
## StdDev.pct_contrib_StdDev.Long/Short Equity
## 2001-12-31 0.1179
## 2002-03-31 0.1241
## 2002-06-30 0.1247
## 2002-09-30 0.0311
## 2002-12-31 0.0263
## 2003-03-31 0.1202
## 2003-06-30 0.0232
## 2003-09-30 0.1174
## 2003-12-31 0.1556
## 2004-03-31 0.1529
## 2004-06-30 0.1484
## 2004-09-30 0.1503
## 2004-12-31 0.1467
## 2005-03-31 0.1120
## 2005-06-30 0.1073
## 2005-09-30 0.1063
## 2005-12-31 0.1118
## 2006-03-31 0.1201
## 2006-06-30 0.1286
## 2006-09-30 0.1290
## 2006-12-31 0.1274
## 2007-03-31 0.1270
## 2007-06-30 0.1288
## 2007-09-30 0.1141
## 2007-12-31 0.1192
## 2008-03-31 0.1308
## 2008-06-30 0.1319
## 2008-09-30 0.1236
## 2008-12-31 0.1182
## 2009-03-31 0.1188
## 2009-06-30 0.1191
## 2009-08-31 0.1191
## StdDev.pct_contrib_StdDev.Merger Arbitrage
## 2001-12-31 0.0600
## 2002-03-31 0.0645
## 2002-06-30 0.0659
## 2002-09-30 0.0699
## 2002-12-31 0.0657
## 2003-03-31 0.0698
## 2003-06-30 0.0578
## 2003-09-30 0.0489
## 2003-12-31 0.0646
## 2004-03-31 0.0629
## 2004-06-30 0.0600
## 2004-09-30 0.0597
## 2004-12-31 0.0581
## 2005-03-31 0.0593
## 2005-06-30 0.0642
## 2005-09-30 0.0608
## 2005-12-31 0.0646
## 2006-03-31 0.0707
## 2006-06-30 0.0675
## 2006-09-30 0.0662
## 2006-12-31 0.0626
## 2007-03-31 0.0641
## 2007-06-30 0.0658
## 2007-09-30 0.0546
## 2007-12-31 0.0594
## 2008-03-31 0.0595
## 2008-06-30 0.0615
## 2008-09-30 0.0529
## 2008-12-31 0.0500
## 2009-03-31 0.0509
## 2009-06-30 0.0464
## 2009-08-31 0.0455
## StdDev.pct_contrib_StdDev.Relative Value
## 2001-12-31 0.0589
## 2002-03-31 0.0608
## 2002-06-30 0.0621
## 2002-09-30 0.0000
## 2002-12-31 0.0000
## 2003-03-31 0.0685
## 2003-06-30 0.0000
## 2003-09-30 0.0644
## 2003-12-31 0.0631
## 2004-03-31 0.0630
## 2004-06-30 0.0627
## 2004-09-30 0.0640
## 2004-12-31 0.0641
## 2005-03-31 0.0670
## 2005-06-30 0.0711
## 2005-09-30 0.0696
## 2005-12-31 0.0693
## 2006-03-31 0.0684
## 2006-06-30 0.0697
## 2006-09-30 0.0703
## 2006-12-31 0.0675
## 2007-03-31 0.0676
## 2007-06-30 0.0688
## 2007-09-30 0.0616
## 2007-12-31 0.0653
## 2008-03-31 0.0692
## 2008-06-30 0.0703
## 2008-09-30 0.0753
## 2008-12-31 0.0851
## 2009-03-31 0.0861
## 2009-06-30 0.0877
## 2009-08-31 0.0884
## StdDev.pct_contrib_StdDev.Short Selling
## 2001-12-31 -0.2005
## 2002-03-31 -0.2106
## 2002-06-30 -0.2068
## 2002-09-30 -0.0194
## 2002-12-31 -0.0162
## 2003-03-31 -0.2030
## 2003-06-30 -0.0155
## 2003-09-30 -0.1467
## 2003-12-31 -0.2594
## 2004-03-31 -0.2525
## 2004-06-30 -0.2350
## 2004-09-30 -0.2359
## 2004-12-31 -0.2228
## 2005-03-31 -0.1495
## 2005-06-30 -0.1214
## 2005-09-30 -0.0965
## 2005-12-31 -0.1074
## 2006-03-31 -0.1165
## 2006-06-30 -0.1339
## 2006-09-30 -0.1352
## 2006-12-31 -0.1268
## 2007-03-31 -0.1250
## 2007-06-30 -0.1311
## 2007-09-30 -0.1031
## 2007-12-31 -0.1196
## 2008-03-31 -0.1285
## 2008-06-30 -0.1250
## 2008-09-30 -0.0892
## 2008-12-31 -0.1105
## 2009-03-31 -0.1106
## 2009-06-30 -0.1065
## 2009-08-31 -0.1102
## StdDev.pct_contrib_StdDev.Funds of Funds
## 2001-12-31 0.1324
## 2002-03-31 0.1351
## 2002-06-30 0.1342
## 2002-09-30 0.0000
## 2002-12-31 0.0000
## 2003-03-31 0.1321
## 2003-06-30 0.0000
## 2003-09-30 0.1212
## 2003-12-31 0.1403
## 2004-03-31 0.1386
## 2004-06-30 0.1295
## 2004-09-30 0.1298
## 2004-12-31 0.1242
## 2005-03-31 0.0888
## 2005-06-30 0.0835
## 2005-09-30 0.0808
## 2005-12-31 0.0860
## 2006-03-31 0.0897
## 2006-06-30 0.0937
## 2006-09-30 0.0939
## 2006-12-31 0.0953
## 2007-03-31 0.0961
## 2007-06-30 0.0975
## 2007-09-30 0.0966
## 2007-12-31 0.0994
## 2008-03-31 0.1063
## 2008-06-30 0.1097
## 2008-09-30 0.1066
## 2008-12-31 0.1046
## 2009-03-31 0.1053
## 2009-06-30 0.1008
## 2009-08-31 0.1002
# Extract the optimal weights for the single period optimization.
# With 13 assets the result is the equal-weight portfolio,
# 1/13 ~= 0.0769 per asset (see the printed weights below).
extractWeights(opt)
## Convertible Arbitrage CTA Global Distressed Securities
## 0.0769 0.0769 0.0769
## Emerging Markets Equity Market Neutral Event Driven
## 0.0769 0.0769 0.0769
## Fixed Income Arbitrage Global Macro Long/Short Equity
## 0.0769 0.0769 0.0769
## Merger Arbitrage Relative Value Short Selling
## 0.0769 0.0769 0.0769
## Funds of Funds
## 0.0769
# Chart the weights for the single period optimization
chart.Weights(opt)
# Extract the optimal weights for the optimization backtest
# (one row of weights per rebalancing date)
extractWeights(opt_rebal)
## Convertible Arbitrage CTA Global Distressed Securities
## 2001-12-31 0.0769 0.0769 0.0769
## 2002-03-31 0.0769 0.0769 0.0769
## 2002-06-30 0.0769 0.0769 0.0769
## 2002-09-30 0.1560 0.2380 0.0740
## 2002-12-31 0.1560 0.2380 0.0740
## 2003-03-31 0.0769 0.0769 0.0769
## 2003-06-30 0.1560 0.2380 0.0740
## 2003-09-30 0.0769 0.0769 0.0769
## 2003-12-31 0.0769 0.0769 0.0769
## 2004-03-31 0.0769 0.0769 0.0769
## 2004-06-30 0.0769 0.0769 0.0769
## 2004-09-30 0.0769 0.0769 0.0769
## 2004-12-31 0.0769 0.0769 0.0769
## 2005-03-31 0.0769 0.0769 0.0769
## 2005-06-30 0.0769 0.0769 0.0769
## 2005-09-30 0.0769 0.0769 0.0769
## 2005-12-31 0.0769 0.0769 0.0769
## 2006-03-31 0.0769 0.0769 0.0769
## 2006-06-30 0.0769 0.0769 0.0769
## 2006-09-30 0.0769 0.0769 0.0769
## 2006-12-31 0.0769 0.0769 0.0769
## 2007-03-31 0.0769 0.0769 0.0769
## 2007-06-30 0.0769 0.0769 0.0769
## 2007-09-30 0.0769 0.0769 0.0769
## 2007-12-31 0.0769 0.0769 0.0769
## 2008-03-31 0.0769 0.0769 0.0769
## 2008-06-30 0.0769 0.0769 0.0769
## 2008-09-30 0.0769 0.0769 0.0769
## 2008-12-31 0.0769 0.0769 0.0769
## 2009-03-31 0.0769 0.0769 0.0769
## 2009-06-30 0.0769 0.0769 0.0769
## 2009-08-31 0.0769 0.0769 0.0769
## Emerging Markets Equity Market Neutral Event Driven
## 2001-12-31 0.0769 0.0769 0.0769
## 2002-03-31 0.0769 0.0769 0.0769
## 2002-06-30 0.0769 0.0769 0.0769
## 2002-09-30 0.0440 0.1360 0.0000
## 2002-12-31 0.0440 0.1360 0.0000
## 2003-03-31 0.0769 0.0769 0.0769
## 2003-06-30 0.0440 0.1360 0.0000
## 2003-09-30 0.0769 0.0769 0.0769
## 2003-12-31 0.0769 0.0769 0.0769
## 2004-03-31 0.0769 0.0769 0.0769
## 2004-06-30 0.0769 0.0769 0.0769
## 2004-09-30 0.0769 0.0769 0.0769
## 2004-12-31 0.0769 0.0769 0.0769
## 2005-03-31 0.0769 0.0769 0.0769
## 2005-06-30 0.0769 0.0769 0.0769
## 2005-09-30 0.0769 0.0769 0.0769
## 2005-12-31 0.0769 0.0769 0.0769
## 2006-03-31 0.0769 0.0769 0.0769
## 2006-06-30 0.0769 0.0769 0.0769
## 2006-09-30 0.0769 0.0769 0.0769
## 2006-12-31 0.0769 0.0769 0.0769
## 2007-03-31 0.0769 0.0769 0.0769
## 2007-06-30 0.0769 0.0769 0.0769
## 2007-09-30 0.0769 0.0769 0.0769
## 2007-12-31 0.0769 0.0769 0.0769
## 2008-03-31 0.0769 0.0769 0.0769
## 2008-06-30 0.0769 0.0769 0.0769
## 2008-09-30 0.0769 0.0769 0.0769
## 2008-12-31 0.0769 0.0769 0.0769
## 2009-03-31 0.0769 0.0769 0.0769
## 2009-06-30 0.0769 0.0769 0.0769
## 2009-08-31 0.0769 0.0769 0.0769
## Fixed Income Arbitrage Global Macro Long/Short Equity
## 2001-12-31 0.0769 0.0769 0.0769
## 2002-03-31 0.0769 0.0769 0.0769
## 2002-06-30 0.0769 0.0769 0.0769
## 2002-09-30 0.0820 0.1380 0.0200
## 2002-12-31 0.0820 0.1380 0.0200
## 2003-03-31 0.0769 0.0769 0.0769
## 2003-06-30 0.0820 0.1380 0.0200
## 2003-09-30 0.0769 0.0769 0.0769
## 2003-12-31 0.0769 0.0769 0.0769
## 2004-03-31 0.0769 0.0769 0.0769
## 2004-06-30 0.0769 0.0769 0.0769
## 2004-09-30 0.0769 0.0769 0.0769
## 2004-12-31 0.0769 0.0769 0.0769
## 2005-03-31 0.0769 0.0769 0.0769
## 2005-06-30 0.0769 0.0769 0.0769
## 2005-09-30 0.0769 0.0769 0.0769
## 2005-12-31 0.0769 0.0769 0.0769
## 2006-03-31 0.0769 0.0769 0.0769
## 2006-06-30 0.0769 0.0769 0.0769
## 2006-09-30 0.0769 0.0769 0.0769
## 2006-12-31 0.0769 0.0769 0.0769
## 2007-03-31 0.0769 0.0769 0.0769
## 2007-06-30 0.0769 0.0769 0.0769
## 2007-09-30 0.0769 0.0769 0.0769
## 2007-12-31 0.0769 0.0769 0.0769
## 2008-03-31 0.0769 0.0769 0.0769
## 2008-06-30 0.0769 0.0769 0.0769
## 2008-09-30 0.0769 0.0769 0.0769
## 2008-12-31 0.0769 0.0769 0.0769
## 2009-03-31 0.0769 0.0769 0.0769
## 2009-06-30 0.0769 0.0769 0.0769
## 2009-08-31 0.0769 0.0769 0.0769
## Merger Arbitrage Relative Value Short Selling Funds of Funds
## 2001-12-31 0.0769 0.0769 0.0769 0.0769
## 2002-03-31 0.0769 0.0769 0.0769 0.0769
## 2002-06-30 0.0769 0.0769 0.0769 0.0769
## 2002-09-30 0.1060 0.0000 0.0060 0.0000
## 2002-12-31 0.1060 0.0000 0.0060 0.0000
## 2003-03-31 0.0769 0.0769 0.0769 0.0769
## 2003-06-30 0.1060 0.0000 0.0060 0.0000
## 2003-09-30 0.0769 0.0769 0.0769 0.0769
## 2003-12-31 0.0769 0.0769 0.0769 0.0769
## 2004-03-31 0.0769 0.0769 0.0769 0.0769
## 2004-06-30 0.0769 0.0769 0.0769 0.0769
## 2004-09-30 0.0769 0.0769 0.0769 0.0769
## 2004-12-31 0.0769 0.0769 0.0769 0.0769
## 2005-03-31 0.0769 0.0769 0.0769 0.0769
## 2005-06-30 0.0769 0.0769 0.0769 0.0769
## 2005-09-30 0.0769 0.0769 0.0769 0.0769
## 2005-12-31 0.0769 0.0769 0.0769 0.0769
## 2006-03-31 0.0769 0.0769 0.0769 0.0769
## 2006-06-30 0.0769 0.0769 0.0769 0.0769
## 2006-09-30 0.0769 0.0769 0.0769 0.0769
## 2006-12-31 0.0769 0.0769 0.0769 0.0769
## 2007-03-31 0.0769 0.0769 0.0769 0.0769
## 2007-06-30 0.0769 0.0769 0.0769 0.0769
## 2007-09-30 0.0769 0.0769 0.0769 0.0769
## 2007-12-31 0.0769 0.0769 0.0769 0.0769
## 2008-03-31 0.0769 0.0769 0.0769 0.0769
## 2008-06-30 0.0769 0.0769 0.0769 0.0769
## 2008-09-30 0.0769 0.0769 0.0769 0.0769
## 2008-12-31 0.0769 0.0769 0.0769 0.0769
## 2009-03-31 0.0769 0.0769 0.0769 0.0769
## 2009-06-30 0.0769 0.0769 0.0769 0.0769
## 2009-08-31 0.0769 0.0769 0.0769 0.0769
# Chart the weights for the optimization backtest
# (weights over time, one series per asset)
chart.Weights(opt_rebal)
Chapter 3 - Objective Functions and Moment Estimation
Introduction to Moments:
Custom Moment Functions:
Objective Functions:
Example code includes:
# Add a return objective with "mean" as the objective name
port_spec <- add.objective(portfolio = port_spec, type = "return", name = "mean")
# Calculate the sample moments; set.portfolio.moments() estimates the
# moments needed by the enabled objectives (here the mean vector `mu`)
moments <- set.portfolio.moments(R = asset_returns, portfolio = port_spec)
# Check if moments$mu is equal to the sample estimate of mean returns
# (element-wise comparison; all 13 entries print TRUE below)
moments$mu == colMeans(asset_returns)
## [,1]
## [1,] TRUE
## [2,] TRUE
## [3,] TRUE
## [4,] TRUE
## [5,] TRUE
## [6,] TRUE
## [7,] TRUE
## [8,] TRUE
## [9,] TRUE
## [10,] TRUE
## [11,] TRUE
## [12,] TRUE
## [13,] TRUE
# Add a risk objective with "StdDev" as the objective name
port_spec <- add.objective(portfolio = port_spec, type = "risk", name = "StdDev")
# Re-estimate the sample moments; with a StdDev objective enabled the
# variance-covariance matrix `sigma` is now populated as well
moments <- set.portfolio.moments(R = asset_returns, portfolio = port_spec)
# Check if moments$sigma is equal to the sample estimate of the variance-covariance matrix
all.equal(moments$sigma, cov(asset_returns))
## [1] TRUE
# Print the portfolio specification object (note the duplicated objective
# names in the output: objectives accumulate on each add.objective call)
print(port_spec)
## **************************************************
## PortfolioAnalytics Portfolio Specification
## **************************************************
##
## Call:
## portfolio.spec(assets = asset_names)
##
## Number of assets: 13
## Asset Names
## [1] "Convertible Arbitrage" "CTA Global" "Distressed Securities"
## [4] "Emerging Markets" "Equity Market Neutral" "Event Driven"
## [7] "Fixed Income Arbitrage" "Global Macro" "Long/Short Equity"
## [10] "Merger Arbitrage"
## More than 10 assets, only printing the first 10
##
## Constraints
## Enabled constraint types
## - weight_sum
## - long_only
##
## Objectives:
## Enabled objective names
## - mean
## - StdDev
## - StdDev
## - mean
## - StdDev
# Fit a statistical factor model (k = 3 latent factors) to the asset returns
fit <- statistical.factor.model(R = asset_returns, k = 3)
# Estimate the portfolio moments using the "boudt" method with 3 factors
moments_boudt <- set.portfolio.moments(R = asset_returns, portfolio = port_spec, method = "boudt", k = 3)
# Check if the covariance matrix extracted from the model fit is equal to the
# estimate in `moments_boudt` (TRUE below: both come from the same factor model)
all.equal(moments_boudt$sigma, extractCovariance(fit))
## [1] TRUE
# Custom moment function: robust covariance estimate via MASS::cov.rob
# (Minimum Covariance Determinant). PortfolioAnalytics invokes momentFUN
# as f(R, portfolio, ...); `portfolio` is accepted but unused because only
# `sigma` is estimated here.
moments_robust <- function(R, portfolio, seed = NULL) {
  # cov.rob treats a *missing* seed differently from a supplied one, so the
  # seed argument is only forwarded when the caller actually provided it.
  cov_args <- list(R, method = "mcd")
  if (!is.null(seed)) {
    cov_args$seed <- seed
  }
  robust_fit <- do.call(MASS::cov.rob, cov_args)
  list(sigma = robust_fit$cov)
}
# Estimate the portfolio moments using the custom function defined above
moments <- moments_robust(R = asset_returns, portfolio = port_spec)
# Check the moment estimate: cov.rob's MCD search uses the RNG, so two
# unseeded runs differ (mean relative difference shown below)
all.equal(MASS::cov.rob(asset_returns, method = "mcd")$cov, moments$sigma)
## [1] "Mean relative difference: 0.0966"
# Repeat with the RNG seeded identically before each call
set.seed(19101508)
moments <- moments_robust(R = asset_returns, portfolio = port_spec)
# Check the moment estimate (no more differences due to same seed)
set.seed(19101508)
all.equal(MASS::cov.rob(asset_returns, method = "mcd")$cov, moments$sigma)
## [1] TRUE
# Create a fresh portfolio specification object using asset_names
port_spec <- portfolio.spec(assets=asset_names)
# Weights must sum to 1 (fully invested) and each weight lies in [0, 1]
port_spec <- add.constraint(portfolio = port_spec, type = "full_investment")
port_spec <- add.constraint(portfolio = port_spec, type = "long_only")
# Single objective: minimize portfolio standard deviation
port_spec <- add.objective(portfolio = port_spec, type = "risk", name = "StdDev")
# Implicitly print the specification
port_spec
## **************************************************
## PortfolioAnalytics Portfolio Specification
## **************************************************
##
## Call:
## portfolio.spec(assets = asset_names)
##
## Number of assets: 13
## Asset Names
## [1] "Convertible Arbitrage" "CTA Global" "Distressed Securities"
## [4] "Emerging Markets" "Equity Market Neutral" "Event Driven"
## [7] "Fixed Income Arbitrage" "Global Macro" "Long/Short Equity"
## [10] "Merger Arbitrage"
## More than 10 assets, only printing the first 10
##
## Constraints
## Enabled constraint types
## - full_investment
## - long_only
##
## Objectives:
## Enabled objective names
## - StdDev
rpData <- c(0.0769, 0.004, 0.014, 0.02, 0.138, 0.004, 0.008, 0, 0.012, 0.018, 0.062, 0.066, 0.176, 0.25, 0, 0.032, 0.1, 0.002, 0.194, 0.326, 0.004, 0.126, 0, 0.036, 0.304, 0.004, 0.002, 0, 0.036, 0, 0, 0.004, 0.146, 0.04, 0.01, 0.192, 0.272, 0.086, 0.074, 0.026, 0.092, 0.024, 0.102, 0, 0.098, 0.102, 0.14, 0.0769, 0.006, 0.046, 0.016, 0, 0.096, 0.146, 0.594, 0.17, 0.16, 0.324, 0.022, 0, 0.054, 0.01, 0.024, 0, 0.002, 0.168, 0.106, 0, 0.002, 0.012, 0.07, 0, 0.388, 0.114, 0.046, 0, 0.406, 0.046, 0.244, 0.254, 0.114, 0.118, 0.07, 0.012, 0, 0.12, 0.002, 0, 0.014, 0, 0.008, 0.048, 0.072, 0.07, 0.0769, 0.02, 0.162, 0.01, 0.014, 0.066, 0, 0.032, 0.002, 0.01, 0.092, 0.026, 0.016, 0, 0.074, 0, 0.012, 0.598, 0.032, 0, 0.042, 0.048, 0.424, 0, 0, 0, 0.622, 0.002, 0.006, 0, 0.036, 0.034, 0, 0.088, 0.002, 0.126, 0.034, 0.07, 0.01, 0.002, 0.17, 0.54, 0.108, 0.002, 0.008, 0.016, 0.124, 0.0769, 0.03, 0.138, 0, 0, 0.096, 0.004, 0, 0, 0, 0.008, 0.05, 0, 0, 0.002, 0.022, 0.018, 0.004, 0.052, 0.02, 0.032, 0.004, 0, 0, 0, 0.276, 0.012, 0.028, 0, 0.004, 0, 0.052, 0, 0, 0.006, 0.034, 0.158, 0, 0.002, 0.004, 0.028, 0.174, 0, 0.178, 0.13, 0.066, 0.054, 0.0769, 0.02, 0.012, 0.014, 0.242, 0.072, 0.54, 0.022, 0.032, 0.07, 0.006, 0, 0.002, 0.02, 0, 0.014, 0.132, 0, 0.05, 0, 0.002, 0.074, 0.004, 0.082, 0, 0.024, 0.008, 0.02, 0.088, 0.168, 0.24, 0.158, 0.116, 0.132, 0.002, 0.022, 0.006, 0.004, 0.008, 0.008, 0.054, 0.01, 0, 0, 0.118, 0.448, 0.18, 0.0769, 0.004, 0.01, 0.116, 0, 0.248, 0.036, 0.208, 0, 0.178, 0.008, 0.16, 0, 0.108, 0, 0.028, 0.048, 0, 0.052, 0, 0.068, 0, 0.25, 0.114, 0.214, 0.044, 0.006, 0.082, 0.76, 0.144, 0.282, 0.028, 0.074, 0.044, 0.598, 0.164, 0.226, 0, 0.264, 0.052, 0.152, 0.002, 0.054, 0.618, 0.552, 0.014, 0, 0.0769, 0, 0.614, 0.484, 0, 0.006, 0, 0.002, 0, 0.07, 0, 0.196, 0.056, 0.004, 0, 0.47, 0, 0, 0.076, 0, 0.18, 0.008, 0.07, 0, 0, 0, 0.04, 0.002, 0, 0.028, 0.016, 0.006, 0.032, 0.002, 0, 0.174, 0.034, 0, 0.04, 0.006, 0, 0.034, 0.014, 0.092, 0.012, 0.112, 0.114, 0.0769, 
0.752, 0, 0.116, 0.494, 0.142, 0, 0.138, 0.54, 0.004, 0.004, 0.072, 0, 0.43, 0.212, 0.058, 0.36, 0.014, 0.002, 0, 0.012, 0.308, 0.066, 0.47, 0, 0.002, 0.004, 0.32, 0.004, 0.032, 0.008, 0.036, 0.002, 0.012, 0.008, 0.056, 0.186, 0, 0.286, 0.02, 0, 0.018, 0.02, 0.028, 0, 0.018, 0.008, 0.0769, 0, 0.004, 0.026, 0, 0.086, 0.14, 0, 0.006, 0, 0.266, 0.004, 0.126, 0.02, 0.066, 0.136, 0, 0, 0.006, 0.08, 0.53, 0.006, 0.024, 0, 0.336, 0.002, 0.018, 0.174, 0, 0.032, 0, 0.09, 0.018, 0.022, 0.04, 0.038, 0.006, 0.508, 0, 0.008, 0.076, 0.01, 0.084, 0.012, 0.008, 0.026, 0.012, 0.0769, 0, 0, 0, 0.012, 0.016, 0.026, 0, 0.012, 0.132, 0, 0.008, 0.196, 0, 0, 0, 0.01, 0, 0.002, 0, 0.084, 0.042, 0.01, 0.018, 0.106, 0.24, 0.032, 0.018, 0.008, 0.088, 0.01, 0.088, 0, 0.014, 0.016, 0.048, 0.026, 0.124, 0.006, 0.13, 0.184, 0.038, 0.256, 0.03, 0.024, 0.028, 0.002, 0.0769, 0.004, 0, 0.186, 0, 0.006, 0.036, 0, 0.054, 0, 0.032, 0.254, 0, 0.112, 0.602, 0.03, 0.01, 0.078, 0.038, 0.454, 0.012, 0.034, 0.07, 0.21, 0, 0.01, 0.006, 0.282, 0, 0.068, 0.002, 0.236, 0.038, 0.044, 0.018, 0.072, 0.026, 0.028, 0.078, 0.134, 0.16, 0.13, 0.064, 0, 0, 0.004, 0.05, 0.0769, 0.002, 0, 0.01, 0, 0.154, 0.056, 0.004, 0.05, 0.204, 0.022, 0.12, 0, 0.002, 0.034, 0.154, 0.016, 0.288, 0.238, 0.004, 0.02, 0.284, 0.056, 0, 0, 0.008, 0.114, 0.004, 0.086, 0.012, 0.102, 0.02, 0.314, 0.092, 0.182, 0.002, 0.012, 0, 0.106, 0.258, 0.032, 0, 0.298, 0.032, 0, 0.08, 0.098, 0.0769, 0.158, 0, 0.002, 0.1, 0.008, 0.008, 0, 0.122, 0.154, 0.176, 0.022, 0.428, 0, 0, 0.032, 0.294, 0.014, 0.09, 0.01, 0.014, 0.064, 0.014, 0, 0.04, 0.002, 0.022, 0.022, 0.012, 0.018, 0.258, 0.004, 0.006, 0.396, 0, 0.002, 0.002, 0.18, 0.006, 0.35, 0.052, 0.006, 0, 0, 0.002, 0.014, 0.148)
# Reshape the pre-generated random-portfolio weights into a 47 x 13 matrix
rp <- matrix(rpData, ncol=13)
# Normalize each row so its weights sum to exactly 1 (full investment)
rp <- t(apply(rp, 1, FUN=function(x) { x/sum(x) }))
# Label the columns with the asset names
# (nameData is presumably defined earlier in the document -- TODO confirm)
dimnames(rp) <- list(NULL, nameData)
str(rp)
## num [1:47, 1:13] 0.0769 0.004 0.014 0.02 0.138 ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr [1:13] "Convertible Arbitrage" "CTA Global" "Distressed Securities" "Emerging Markets" ...
# Run the optimization with custom moment estimates: random-portfolio search
# over the fixed weight matrix `rp`, with sigma supplied by moments_robust()
# (momentFUN given here as a function *name* string)
opt_custom <- optimize.portfolio(R = asset_returns, portfolio = port_spec,
optimize_method = "random", rp = rp, momentFUN = "moments_robust"
)
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
# Print the results of the optimization with custom moment estimates
print(opt_custom)
## ***********************************
## PortfolioAnalytics Optimization
## ***********************************
##
## Call:
## optimize.portfolio(R = asset_returns, portfolio = port_spec,
## optimize_method = "random", rp = rp, momentFUN = "moments_robust")
##
## Optimal Weights:
## Convertible Arbitrage CTA Global Distressed Securities
## 0.032 0.024 0.000
## Emerging Markets Equity Market Neutral Event Driven
## 0.022 0.014 0.028
## Fixed Income Arbitrage Global Macro Long/Short Equity
## 0.470 0.058 0.136
## Merger Arbitrage Relative Value Short Selling
## 0.000 0.030 0.154
## Funds of Funds
## 0.032
##
## Objective Measures:
## StdDev
## 0.005933
# Run the optimization with sample moment estimates (default momentFUN),
# using the same random-portfolio weight matrix `rp` for comparability
opt_sample <- optimize.portfolio(R = asset_returns, portfolio = port_spec,
optimize_method = "random", rp = rp
)
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
# Print the results of the optimization with sample moment estimates
print(opt_sample)
## ***********************************
## PortfolioAnalytics Optimization
## ***********************************
##
## Call:
## optimize.portfolio(R = asset_returns, portfolio = port_spec,
## optimize_method = "random", rp = rp)
##
## Optimal Weights:
## Convertible Arbitrage CTA Global Distressed Securities
## 0.008 0.146 0.000
## Emerging Markets Equity Market Neutral Event Driven
## 0.004 0.540 0.036
## Fixed Income Arbitrage Global Macro Long/Short Equity
## 0.000 0.000 0.140
## Merger Arbitrage Relative Value Short Selling
## 0.026 0.036 0.056
## Funds of Funds
## 0.008
##
## Objective Measures:
## StdDev
## 0.008842
# Custom objective: annualized portfolio standard deviation.
# `R` is part of the objective-function signature used by PortfolioAnalytics
# but is not needed once `sigma` (the covariance matrix) is supplied.
pasd <- function(R, weights, sigma, scale = 12) {
  # Quadratic form w' Sigma w gives the per-period portfolio variance;
  # multiplying the sd by sqrt(scale) annualizes it (scale = 12 for monthly data).
  periodic_var <- as.numeric(t(weights) %*% sigma %*% weights)
  sqrt(periodic_var) * sqrt(scale)
}
# Add the custom objective to the portfolio specification; `name = "pasd"`
# must match the name of a function visible to the optimizer
port_spec <- add.objective(portfolio = port_spec, type = "risk", name = "pasd")
# Print the portfolio specification object
port_spec
## **************************************************
## PortfolioAnalytics Portfolio Specification
## **************************************************
##
## Call:
## portfolio.spec(assets = asset_names)
##
## Number of assets: 13
## Asset Names
## [1] "Convertible Arbitrage" "CTA Global" "Distressed Securities"
## [4] "Emerging Markets" "Equity Market Neutral" "Event Driven"
## [7] "Fixed Income Arbitrage" "Global Macro" "Long/Short Equity"
## [10] "Merger Arbitrage"
## More than 10 assets, only printing the first 10
##
## Constraints
## Enabled constraint types
## - full_investment
## - long_only
##
## Objectives:
## Enabled objective names
## - StdDev
## - pasd
# Minimal custom moment function: supplies only the sample
# variance-covariance matrix of the returns in `R`.
set_sigma <- function(R) {
  list(sigma = cov(R))
}
# Run the optimization, this time passing the custom moment function as a
# function object (vs. a function-name string in the earlier example)
opt <- optimize.portfolio(R = asset_returns, portfolio = port_spec, momentFUN = set_sigma,
optimize_method = "random", rp = rp
)
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
# Print the results of the optimization
print(opt)
## ***********************************
## PortfolioAnalytics Optimization
## ***********************************
##
## Call:
## optimize.portfolio(R = asset_returns, portfolio = port_spec,
## optimize_method = "random", rp = rp, momentFUN = set_sigma)
##
## Optimal Weights:
## Convertible Arbitrage CTA Global Distressed Securities
## 0.008 0.146 0.000
## Emerging Markets Equity Market Neutral Event Driven
## 0.004 0.540 0.036
## Fixed Income Arbitrage Global Macro Long/Short Equity
## 0.000 0.000 0.140
## Merger Arbitrage Relative Value Short Selling
## 0.026 0.036 0.056
## Funds of Funds
## 0.008
##
## Objective Measures:
## StdDev
## 0.008842
##
##
## pasd
## 0.03063
Chapter 4 - Application
Applications:
Optimization Backtest:
Wrap up:
Example code includes:
# Reset the plotting layout and margins to their defaults
par(mfrow=c(1, 1))
par(mfcol=c(1, 1))
par(mar=c(5, 4, 4, 2) + 0.1)
library(PortfolioAnalytics)
# Load the edhec data set: monthly hedge-fund strategy returns
# (an xts of 152 months x 13 strategies, per the str() output below)
data(edhec)
str(edhec)
## An 'xts' object on 1997-01-31/2009-08-31 containing:
## Data: num [1:152, 1:13] 0.0119 0.0123 0.0078 0.0086 0.0156 0.0212 0.0193 0.0134 0.0122 0.01 ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr [1:13] "Convertible Arbitrage" "CTA Global" "Distressed Securities" "Emerging Markets" ...
## Indexed by objects of class: [Date] TZ: UTC
## xts Attributes:
## NULL
# Assign the data to a variable
asset_returns <- edhec
# Create a vector of equal weights
equal_weights <- rep(1 / ncol(asset_returns), ncol(asset_returns))
# Compute the benchmark returns
r_benchmark <- Return.portfolio(R = asset_returns, weights = equal_weights, rebalance_on = "quarters")
colnames(r_benchmark) <- "benchmark"
# Plot the benchmark returns
plot(r_benchmark)
# Create the portfolio specification
port_spec <- portfolio.spec(assets=colnames(asset_returns))
# Add a full investment constraint such that the weights sum to 1
port_spec <- add.constraint(portfolio=port_spec, type="full_investment")
# Add a long only constraint such that the weight of an asset is between 0 and 1
port_spec <- add.constraint(portfolio=port_spec, type="long_only")
# Add an objective to minimize portfolio standard deviation
port_spec <- add.objective(portfolio=port_spec, type="risk", name="StdDev")
# Print the portfolio specification
port_spec
## **************************************************
## PortfolioAnalytics Portfolio Specification
## **************************************************
##
## Call:
## portfolio.spec(assets = colnames(asset_returns))
##
## Number of assets: 13
## Asset Names
## [1] "Convertible Arbitrage" "CTA Global" "Distressed Securities"
## [4] "Emerging Markets" "Equity Market Neutral" "Event Driven"
## [7] "Fixed Income Arbitrage" "Global Macro" "Long/Short Equity"
## [10] "Merger Arbitrage"
## More than 10 assets, only printing the first 10
##
## Constraints
## Enabled constraint types
## - full_investment
## - long_only
##
## Objectives:
## Enabled objective names
## - StdDev
# Run the optimization
opt_rebal_base <- optimize.portfolio.rebalancing(R = asset_returns, portfolio = port_spec,
optimize_method = "ROI", rebalance_on = "quarters",
training_period = 60, rolling_window = 60
)
# Print the results
print(opt_rebal_base)
## **************************************************
## PortfolioAnalytics Optimization with Rebalancing
## **************************************************
##
## Call:
## optimize.portfolio.rebalancing(R = asset_returns, portfolio = port_spec,
## optimize_method = "ROI", rebalance_on = "quarters", training_period = 60,
## rolling_window = 60)
##
## Number of rebalancing dates: 32
## First rebalance date:
## [1] "2001-12-31"
## Last rebalance date:
## [1] "2009-08-31"
##
## Annualized Portfolio Rebalancing Return:
## [1] 0.0513
##
## Annualized Portfolio Standard Deviation:
## [1] 0.0205
# Chart the weights
chart.Weights(opt_rebal_base)
# Compute the portfolio returns
returns_base <- Return.portfolio(R = asset_returns, weights = extractWeights(opt_rebal_base))
colnames(returns_base) <- "base"
# Add a risk budget objective
port_spec <- add.objective(portfolio = port_spec, type = "risk_budget", name = "StdDev",
min_prisk = 0.05, max_prisk = 0.1
)
# Run the optimization
opt_rebal_rb <- optimize.portfolio.rebalancing(R = asset_returns, portfolio = port_spec,
optimize_method = "random", rp =rp, trace = TRUE,
rebalance_on = "quarters", training_period = 60,
rolling_window = 60
)
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
# Chart the weights
chart.Weights(opt_rebal_rb)
# Chart the percentage contribution to risk
chart.RiskBudget(opt_rebal_rb, match.col = "StdDev", risk.type = "percentage")
# Compute the portfolio returns
returns_rb <- Return.portfolio(R = asset_returns, weights = extractWeights(opt_rebal_rb))
colnames(returns_rb) <- "risk_budget"
# Run the optimization
opt_rebal_rb_robust <- optimize.portfolio.rebalancing(R = asset_returns, momentFUN = "moments_robust",
portfolio = port_spec, optimize_method = "random",
rp = rp, trace = TRUE, rebalance_on = "quarters",
training_period = 60, rolling_window = 60
)
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
## Leverage constraint min_sum and max_sum are restrictive,
## consider relaxing. e.g. 'full_investment' constraint should be min_sum=0.99 and max_sum=1.01
# Chart the weights
chart.Weights(opt_rebal_rb_robust)
# Chart the percentage contribution to risk
chart.RiskBudget(opt_rebal_rb_robust, match.col = "StdDev", risk.type = "percentage")
# Compute the portfolio returns
returns_rb_robust <- Return.portfolio(R = asset_returns, weights = extractWeights(opt_rebal_rb_robust))
colnames(returns_rb_robust) <- "rb_robust"
# Combine the returns
ret <- cbind(r_benchmark, returns_base, returns_rb, returns_rb_robust)
# Compute annualized returns
table.AnnualizedReturns(R = ret)
## benchmark base risk_budget rb_robust
## Annualized Return 0.0826 0.0513 0.0631 0.0660
## Annualized Std Dev 0.0391 0.0205 0.0417 0.0426
## Annualized Sharpe (Rf=0%) 2.1121 2.5014 1.5152 1.5489
# Chart the performance summary
charts.PerformanceSummary(R = ret)
Chapter 1 - Introduction and Plain Vanilla Bond Valuation
Introduction - Clifford Ang, author of books in the area:
Time Value of Money (TVM) - generally, a dollar today is worth more than a dollar tomorrow:
Bond Valuation:
Convert Code to Functions:
Example code includes:
# Create pv
pv <- 100
# Create r
r <- 0.1
# Calculate fv1
fv1 <- pv * (1 + r)
# Calculate fv2
fv2 <- pv * (1+r)**2
# Calculate pv1
pv1 <- fv1 / (1 + r)
# Calculate pv2
pv2 <- fv2 / (1 + r)^2
# Print pv1 and pv2
print(pv1)
## [1] 100
print(pv2)
## [1] 100
# Create vector of cash flows
cf <- c(5, 5, 5, 5, 105)
# Convert to data frame
cf <- data.frame(cf)
cf
## cf
## 1 5
## 2 5
## 3 5
## 4 5
## 5 105
# Add column t to cf
cf$t <- as.numeric(rownames(cf))
# Calculate pv_factor
cf$pv_factor <- 1 / (1 + 0.06)^cf$t
# Calculate pv
cf$pv <- cf$cf * cf$pv_factor
# Calculate the bond price
sum(cf$pv)
## [1] 95.8
# Create function
# Price a plain-vanilla coupon bond.
#   p   - par (face) value
#   r   - coupon rate per period
#   ttm - number of periods to maturity
#   y   - yield to maturity per period
# Returns the sum of each cash flow discounted at y: coupons of p * r for
# periods 1..ttm-1 and a final payment of coupon plus par at period ttm.
bondprc <- function(p, r, ttm, y) {
  cash_flows <- c(rep(p * r, ttm - 1), p * (1 + r))
  periods <- seq_along(cash_flows)
  sum(cash_flows / (1 + y)^periods)
}
# Verify prior result
bondprc(100, 0.05, 5, 0.06)
## [1] 95.8
Chapter 2 - Yield to Maturity
Price-Yield Relationship:
Components of Yield:
Estimating Yield of a Bond:
Example code includes:
# Load Quandl package
library(Quandl)
# Obtain Moody's Baa index data
baa <- Quandl("FED/RIMLPBAAR_N_M")
# Identify 9/30/16 yield
baa_yield <- subset(baa, baa$Date == "2016-09-30")
# Convert yield to decimals and view
baa_yield <- baa_yield$Value/100
baa_yield
## [1] 0.0431
bondprc(p = 100, r = 0.05, ttm = 5, y = baa_yield)
## [1] 103
# Generate prc_yld
prc_yld <- seq(0.02, 0.4, by=0.01)
# Convert prc_yld to data frame
prc_yld <- data.frame(prc_yld)
# Calculate bond price given different yields
for (i in 1:nrow(prc_yld)) {
prc_yld$price[i] <- bondprc(100, 0.10, 20, prc_yld$prc_yld[i])
}
# Plot P/YTM relationship
plot(prc_yld, type = "l", col = "blue", main = "Price/YTM Relationship")
# Generate prc_yld
prc_yld <- seq(0.02, 0.4, by=0.01)
# Convert prc_yld to data frame
prc_yld <- data.frame(prc_yld)
# Calculate bond price given different yields
for (i in 1:nrow(prc_yld)) {
prc_yld$price[i] <- bondprc(100, 0.10, 20, prc_yld$prc_yld[i])
}
# Plot P/YTM relationship
plot(prc_yld, type = "l", col = "blue", main = "Price/YTM Relationship")
# Examine first and last six elements in spread
# head(spread)
# tail(spread)
# Calculate spread$diff
# spread$diff <- 100 * (spread$baa - spread$aaa)
# Plot spread
# plot(x = spread$date, y = spread$diff, type = "l", xlab = "Date", ylab = "Spread (bps)",
# col = "red", main = "Baa - Aaa Spread"
# )
# Value bond using 5% yield
bondprc(p = 100, r = 0.05, ttm = 5, y = 0.05)
## [1] 100
# Value bond using 7% yield
bondprc(p = 100, r = 0.05, ttm = 5, y = 0.07)
## [1] 91.8
# Value bond using 6% yield
bondprc(p = 100, r = 0.05, ttm = 5, y = 0.06)
## [1] 95.8
# Create cash flow vector
cf <- c(-95.79, 5, 5, 5, 5, 105)
# Create bond valuation function
bval <- function(i, cf, t=seq(along = cf)) sum(cf / (1 + i)^t)
# Create ytm() function using uniroot
# Yield to maturity: the discount rate in (0, 1) at which the discounted
# cash flows (price as a negative outflow plus coupons and principal)
# sum to zero, found numerically with uniroot() over bval().
ytm <- function(cf) {
  solution <- uniroot(bval, interval = c(0, 1), cf = cf)
  solution$root
}
# Use ytm() function to find yield
ytm(cf)
## [1] 0.06
Chapter 3 - Duration and Convexity
Bond price volatility and Price Value of Basis Point:
Duration - estimated price change for a 100 basis point change in yield:
Convexity - curvature is not well captured by duration:
Example code includes:
# Calculate the PV01
PV01 <- abs( bondprc(p=100, r=0.1, ttm=20, y=0.10) - bondprc(p=100, r=0.1, ttm=20, y=0.1001) )
# Calculate bond price today
px <- bondprc(p = 100, r = 0.1, ttm = 20, y = 0.1)
px
## [1] 100
# Calculate bond price if yields increase by 1%
px_up <- bondprc(p = 100, r = 0.1, ttm = 20, y = 0.11)
px_up
## [1] 92
# Calculate bond price if yields decrease by 1%
px_down <- bondprc(p = 100, r = 0.1, ttm = 20, y = 0.09)
px_down
## [1] 109
# Calculate approximate duration
duration <- (px_down - px_up) / (2 * px * 0.01)
duration
## [1] 8.55
# Estimate percentage change
duration_pct_change <- -duration * -0.01
duration_pct_change
## [1] 0.0855
# Estimate dollar change
duration_dollar_change <- duration_pct_change * px
duration_dollar_change
## [1] 8.55
# Calculate approximate convexity
convexity <- (px_up + px_down - 2 * px) / (px * (0.01)^2)
convexity
## [1] 117
# Estimate percentage change
convexity_pct_change <- 0.5 * convexity * (0.01)^2
convexity_pct_change
## [1] 0.00583
# Estimate dollar change
convexity_dollar_change <- convexity_pct_change * px
convexity_dollar_change
## [1] 0.583
# Estimate change in price
price_change <- duration_dollar_change + convexity_dollar_change
# Estimate price
price <- px + duration_dollar_change + convexity_dollar_change
Chapter 4 - Comprehensive Example
Summarizing main lessons:
Duration and convexity:
Wrap up:
Example code includes:
# Load Quandl package
library(Quandl)
# Obtain Moody's Aaa yield
aaa <- Quandl("FED/RIMLPAAAR_N_M")
# identify yield on September 30, 2016
aaa_yield <- subset(aaa, aaa$Date == "2016-09-30")
# Convert yield into decimals
aaa_yield <- as.numeric(aaa_yield$Value) / 100
aaa_yield
## [1] 0.0341
# Layout the bond's cash flows
cf <- c(3, 3, 3, 3, 3, 3, 3, 103)
# Convert to data.frame
cf <- data.frame(cf)
# Add time indicator
cf$t <- seq(1, 8, by=1)
# Calculate PV factor
cf$pv_factor <- 1 / (1 + aaa_yield)^cf$t
# Calculate PV
cf$pv <- cf$cf * cf$pv_factor
# Price bond
sum(cf$pv)
## [1] 97.2
# Code cash flow function
# Build a coupon bond's cash flow vector: coupons of p * r in periods
# 1..ttm-1, then coupon plus par (p * (1 + r)) at maturity.
alt_cf <- function(r, p, ttm) {
  coupons <- rep(p * r, ttm - 1)
  final_payment <- p * (1 + r)
  c(coupons, final_payment)
}
# Generate cf vector
alt_cf(r = 0.03, p = 100, ttm = 8)
## [1] 3 3 3 3 3 3 3 103
# Calculate bond price when yield increases
px_up <- bondprc(p = 100, r = 0.03, ttm = 8, y = aaa_yield+0.01)
# Calculate bond price when yield decreases
px_down <- bondprc(p = 100, r = 0.03, ttm = 8, y = aaa_yield-0.01)
# Calculate duration
duration <- (px_down - px_up) / (2 * px * 0.01)
# Calculate percentage effect of duration on price
duration_pct_change <- -duration * 0.01
duration_pct_change
## [1] -0.0679
# Calculate dollar effect of duration on price
duration_dollar_change <- duration_pct_change * px
duration_dollar_change
## [1] -6.79
# Calculate convexity measure
convexity <- (px_up + px_down - 2*px) / (px * 0.01^2)
# Calculate percentage effect of convexity on price
convexity_pct_change <- 0.5 * convexity * 0.01^2
convexity_pct_change
## [1] -0.0254
# Calculate dollar effect of convexity on price
convexity_dollar_change <- convexity_pct_change * px
convexity_dollar_change
## [1] -2.54
# Estimate price_change
price_change <- duration_dollar_change + convexity_dollar_change
price_change
## [1] -9.33
# Estimate new_price
new_price <- px + duration_dollar_change + convexity_dollar_change
new_price
## [1] 90.7
Chapter 1 - Present Value Approaches
Course introduction and fundamental valuation:
Free cash flow to equity model (FCFE):
Calculating equity value:
Example code includes:
fv <- 100
r <-0.05
# Calculate PV of $100 one year from now
pv_1 <- fv / (1+r)
pv_1
## [1] 95.2
# Calculate PV of $100 two years from now
pv_2 <- fv / (1+r)**2
pv_2
## [1] 90.7
# After-tax income differs from free cash flow to equity (FCFE) because it includes non-cash items and does not exclude any additional investments necessary to maintain the firm's operations and grow the firm based on its projections
after_tax_income <- c(22.8, 24, 30.6, 38.4, 43.2)
capex <- c(11, 11, 12, 14, 15)
depn_amort <- c(11, 12, 12, 14, 15)
incr_wc <- c(16, 16, 14, 14, 14)
# Calculate FCFE
fcfe <- after_tax_income - capex + depn_amort - incr_wc
fcfe
## [1] 6.8 9.0 16.6 24.4 29.2
g <- 0.034
ke <- 0.105
# Calculate the terminal value as of 2021
tv_2021 <- fcfe[length(fcfe)] * (1 + g)/ (ke - g)
tv_2021
## [1] 425
fcfe <- as.data.frame(fcfe)
rownames(fcfe) <- 2017:2021
# Add discount periods
fcfe$periods <- 1:5
fcfe
## fcfe periods
## 2017 6.8 1
## 2018 9.0 2
## 2019 16.6 3
## 2020 24.4 4
## 2021 29.2 5
# Calculate Present Value Factor
fcfe$pv_factor <- (1+ke)**(-fcfe$periods)
fcfe
## fcfe periods pv_factor
## 2017 6.8 1 0.905
## 2018 9.0 2 0.819
## 2019 16.6 3 0.741
## 2020 24.4 4 0.671
## 2021 29.2 5 0.607
# Calculate Present Value of each Cash Flow
fcfe$pv <- fcfe$fcfe * fcfe$pv_factor
fcfe
## fcfe periods pv_factor pv
## 2017 6.8 1 0.905 6.15
## 2018 9.0 2 0.819 7.37
## 2019 16.6 3 0.741 12.30
## 2020 24.4 4 0.671 16.37
## 2021 29.2 5 0.607 17.72
# Total Present Value
pv_fcfe <- sum(fcfe$pv)
pv_fcfe
## [1] 59.9
# Calculate Present Value
pv_tv <- tv_2021 * (1 + ke)**-5
pv_tv
## [1] 258
# Calculate Equity Value
eq_val <- pv_fcfe + pv_tv
eq_val
## [1] 318
shout <- 10
# Calculate Equity Value Per Share
eq_val_per_share <- eq_val / shout
eq_val_per_share
## [1] 31.8
Chapter 2 - Perpetuity Growth Rate, Analyzing Projections, Dividend Discount Model
Analyzing the projections:
Perpetuity growth rate (PGR) - “one of the most commonly abused inputs in the model”:
Dividend discount model:
Example code includes:
hist_rev <- c(86.8, 89, 93, 128.6, 176.4, 171.4, 214.2, 236, 0, 0, 0, 0, 0)
rev_proj <- c(0, 0, 0, 0, 0, 0, 0, 0, 193.2, 212.9, 225, 279.2, 295.9)
# Combine hist_rev and rev_proj
rev_split <- rbind(hist_rev, rev_proj)
# Rename the column headers
colnames(rev_split) <- seq(2009, 2021, 1)
# Create a bar plot of the data
barplot(rev_split, col = c("red", "blue"), main = "Historical vs. Projected Revenues")
legend("topleft", legend = c("Historical", "Projected"), fill = c("red", "blue"))
rev_all <- data.frame(rev_proj = pmax(hist_rev, rev_proj))
# Create a trend variable
rev_all$trend <- 1:13
# Create shift variable
rev_all$shift <- c(rep(0, 8), rep(1, 5))
str(rev_all)
## 'data.frame': 13 obs. of 3 variables:
## $ rev_proj: num 86.8 89 93 128.6 176.4 ...
## $ trend : int 1 2 3 4 5 6 7 8 9 10 ...
## $ shift : num 0 0 0 0 0 0 0 0 1 1 ...
# Run regression
reg <- lm(rev_proj ~ trend + shift, data=rev_all)
# Print regression summary
summary(reg)
##
## Call:
## lm(formula = rev_proj ~ trend + shift, data = rev_all)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.413 -8.821 -0.025 6.645 21.402
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 41.39 9.95 4.16 0.0019 **
## trend 24.01 1.92 12.48 2e-07 ***
## shift -64.23 14.79 -4.34 0.0015 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.9 on 10 degrees of freedom
## Multiple R-squared: 0.967, Adjusted R-squared: 0.96
## F-statistic: 145 on 2 and 10 DF, p-value: 4.08e-08
# Calculate reinvestment amount
reinvestment <- (capex + incr_wc - depn_amort)[5]
reinvestment
## [1] 14
# Calculate retention ratio
retention_ratio <- reinvestment / after_tax_income[5]
retention_ratio
## [1] 0.324
# Calculate expected growth rate
exp_growth_rate <- retention_ratio * ke
exp_growth_rate
## [1] 0.034
stated_value <- 25
div_rate <- 0.05
kp <- 0.1
# Calculate dividend of preferred stock
div <- stated_value * div_rate
div
## [1] 1.25
# Calculate value of preferred stock
pref_value <- div / kp
pref_value
## [1] 12.5
# Value of Preferred if dividends start five years from now
pref_value_yr5 <- (stated_value * div_rate) / kp
pref_value_yr5
## [1] 12.5
# Value discounted to present
pref_value <- pref_value_yr5 / (1 + kp)**5
pref_value
## [1] 7.76
# Preferred dividend in Years 1 to 5
high_div <- 2.5
# Create vector of Year 1-5 dividends
pref_cf <- rep(high_div, 5)
# Convert to data frame
pref_df <- data.frame(pref_cf)
# Add discount periods
pref_df$periods <- 1:5
# Calculate discount factors
pref_df$pv_factor <- (1+kp)**(-pref_df$periods)
# Calculate PV of dividends
pref_df$pv_cf <- pref_df$pref_cf * pref_df$pv_factor
# Calculate value during high stage
pref_value_high <- sum(pref_df$pv_cf)
# Calculate value of the preferred stock
pref_value_high + pref_value
## [1] 17.2
Chapter 3 - Discount Rate and Cost of Capital
What is a discount rate?
Unlevering betas:
Risk-free rate and Equity Risk Premium:
Example code includes:
priceData <- c(21.46, 20.75, 23.44, 23.45, 21.69, 21.67, 21.37, 23.03, 23.57, 24.37, 25.34, 27.18, 27.45, 28.27, 29.61, 28.96, 29.11, 30.48, 31.03, 33.56, 35.34, 38.17, 37.87, 44.13, 43.4, 45.41, 55.57, 48.83, 50.78, 49.84, 51.56, 49.37, 48.6, 45.49, 53.55, 58.61, 56.37, 53.15, 57.33, 59.35, 72.26, 72.63, 67.86, 55.99, 49.59, 40.26, 44.09, 51.3, 54.07, 52.69, 45.07, 46.35, 41.71, 43.34, 43.24, 46.79, 42.36, 38.12, 36.5, 36.61, 38.15, 125.5, 131.32, 137.02, 140.81, 139.87, 131.47, 136.1, 137.71, 141.16, 143.97, 141.35, 142.15, 142.41, 149.7, 151.61, 156.67, 159.68, 163.45, 160.42, 168.71, 163.65, 168.01, 175.79, 181, 184.69, 178.18, 186.29, 187.01, 188.31, 192.68, 195.72, 193.09, 200.71, 197.02, 201.66, 207.2, 205.54, 199.45, 210.66, 206.43, 208.46, 211.14, 205.85, 210.5, 197.67, 191.63, 207.93, 208.69, 203.87, 193.72, 193.56, 205.52, 206.33, 209.84, 209.48, 217.12, 217.38, 216.3, 212.55, 220.38, 223.53)
idxDates <- c(15339, 15370, 15399, 15430, 15460, 15491, 15521, 15552, 15583, 15613, 15644, 15674, 15705, 15736, 15764, 15795, 15825, 15856, 15886, 15917, 15948, 15978, 16009, 16039, 16070, 16101, 16129, 16160, 16190, 16221, 16251, 16282, 16313, 16343, 16374, 16404, 16435, 16466, 16494, 16525, 16555, 16586, 16616, 16647, 16678, 16708, 16739, 16769, 16800, 16831, 16860, 16891, 16921, 16952, 16982, 17013, 17044, 17074, 17105, 17135, 17166)
prices = xts(matrix(priceData, ncol=2, byrow=FALSE), order.by=as.Date(idxDates))
names(prices) <- c("myl_prc", "spy_prc")
str(prices)
## An 'xts' object on 2011-12-31/2016-12-31 containing:
## Data: num [1:61, 1:2] 21.5 20.8 23.4 23.4 21.7 ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr [1:2] "myl_prc" "spy_prc"
## Indexed by objects of class: [Date] TZ: UTC
## xts Attributes:
## NULL
# Show first six observations of prices
head(prices)
## myl_prc spy_prc
## 2011-12-31 21.5 126
## 2012-01-31 20.8 131
## 2012-02-29 23.4 137
## 2012-03-31 23.4 141
## 2012-04-30 21.7 140
## 2012-05-31 21.7 131
# Calculate MYL monthly return
rets <- quantmod::Delt(prices$myl_prc)
# Calculate SPY monthly return
rets$spy <- quantmod::Delt(prices$spy_prc)
# Change label of first variable
names(rets)[1] <- "myl"
# Remove first observation - NA
rets <- rets[-1, ]
# Run regression
reg <- lm(myl ~ spy, data=rets)
# Save beta
myl_beta <- coef(reg)[2]
myl_beta
## spy
## 1.11
debt_beta <- 0.08
myl_debt_eq <- 1.68
# Calculate the Mylan Unlevered Beta
myl_unl_beta <- (myl_beta + debt_beta * (1 - 0.4) * myl_debt_eq) / (1 + (1 - 0.4) * myl_debt_eq)
myl_unl_beta
## spy
## 0.593
med_beta <- 0.777
debt_eq <- 1.5
# Calculate levered beta
beta <- med_beta * (1 + (1 - 0.4) * debt_eq) - debt_beta * (1 - 0.4) * debt_eq
beta
## [1] 1.4
treas <- Quandl::Quandl(code="FRED/DGS10")
# Review treas
head(treas)
## Date Value
## 1 2019-11-27 1.77
## 2 2019-11-26 1.74
## 3 2019-11-25 1.76
## 4 2019-11-22 1.77
## 5 2019-11-21 1.77
## 6 2019-11-20 1.73
# Extract 2016-12-30 yield
rf <- treas[treas$Date == "2016-12-30", ]
rf
## Date Value
## 728 2016-12-30 2.45
# Keep only the observation in the second column
rf_yield <- rf$Value
rf_yield
## [1] 2.45
# Convert yield to decimal terms
rf_yield_dec <- rf_yield / 100
rf_yield_dec
## [1] 0.0245
year <- 1928:2016
# Damodaran historical annual total returns: S&P 500 and the 10-year
# Treasury bond, 1928 onward. The matching `year` vector is defined
# earlier in this document.
sp500 <- c(0.4381, -0.083, -0.2512, -0.4384, -0.0864, 0.4998, -0.0119, 0.4674, 0.3194, -0.3534, 0.2928, -0.011, -0.1067, -0.1277, 0.1917, 0.2506, 0.1903, 0.3582, -0.0843, 0.052, 0.057, 0.183, 0.3081, 0.2368, 0.1815, -0.0121, 0.5256, 0.326, 0.0744, -0.1046, 0.4372, 0.1206, 0.0034, 0.2664, -0.0881, 0.2261, 0.1642, 0.124, -0.0997, 0.238, 0.1081, -0.0824, 0.0356, 0.1422, 0.1876, -0.1431, -0.259, 0.37, 0.2383, -0.0698, 0.0651, 0.1852, 0.3174, -0.047, 0.2042, 0.2234, 0.0615, 0.3124, 0.1849, 0.0581, 0.1654, 0.3148, -0.0306, 0.3023, 0.0749, 0.0997, 0.0133, 0.372, 0.2268, 0.331, 0.2834, 0.2089, -0.0903, -0.1185, -0.2197, 0.2836, 0.1074, 0.0483, 0.1561, 0.0548, -0.3655, 0.2594, 0.1482, 0.021, 0.1589, 0.3215, 0.1352, 0.0136, 0.1174)
tbond <- c(0.0084, 0.042, 0.0454, -0.0256, 0.0879, 0.0186, 0.0796, 0.0447, 0.0502, 0.0138, 0.0421, 0.0441, 0.054, -0.0202, 0.0229, 0.0249, 0.0258, 0.038, 0.0313, 0.0092, 0.0195, 0.0466, 0.0043, -0.003, 0.0227, 0.0414, 0.0329, -0.0134, -0.0226, 0.068, -0.021, -0.0265, 0.1164, 0.0206, 0.0569, 0.0168, 0.0373, 0.0072, 0.0291, -0.0158, 0.0327, -0.0501, 0.1675, 0.0979, 0.0282, 0.0366, 0.0199, 0.0361, 0.1598, 0.0129, -0.0078, 0.0067, -0.0299, 0.082, 0.3281, 0.032, 0.1373, 0.2571, 0.2428, -0.0496, 0.0822, 0.1769, 0.0624, 0.15, 0.0936, 0.1421, -0.0804, 0.2348, 0.0143, 0.0994, 0.1492, -0.0825, 0.1666, 0.0557, 0.1512, 0.0038, 0.0449, 0.0287, 0.0196, 0.1021, 0.201, -0.1112, 0.0846, 0.1604, 0.0297, -0.091, 0.1075, 0.0128, 0.0069)
# Combine into a data frame; use `<-` for assignment (tidyverse style),
# not `=`, which the original transcript used here.
damodaran <- data.frame(year=year, sp_500=sp500, tbond_10yr=tbond)
str(damodaran)
## 'data.frame': 89 obs. of 3 variables:
## $ year : int 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 ...
## $ sp_500 : num 0.4381 -0.083 -0.2512 -0.4384 -0.0864 ...
## $ tbond_10yr: num 0.0084 0.042 0.0454 -0.0256 0.0879 0.0186 0.0796 0.0447 0.0502 0.0138 ...
# Review the first six rows of damodaran
head(damodaran)
## year sp_500 tbond_10yr
## 1 1928 0.4381 0.0084
## 2 1929 -0.0830 0.0420
## 3 1930 -0.2512 0.0454
## 4 1931 -0.4384 -0.0256
## 5 1932 -0.0864 0.0879
## 6 1933 0.4998 0.0186
# Calculate annual difference between stocks and bonds
# (named excess_ret rather than `diff` to avoid shadowing base::diff)
excess_ret <- damodaran$sp_500 - damodaran$tbond_10yr
# Calculate ERP (equity risk premium) as the historical mean excess return
erp <- mean(excess_ret)
erp
## [1] 0.0624
relevered_beta <- 1.404572
rf <- 0.0245
# NOTE(review): 0.0623 here differs slightly from the 0.0624 computed above;
# it is the value supplied by the course exercise.
erp <- 0.0623
# CAPM cost of equity = risk-free rate + beta * equity risk premium
capm_coe <- rf + relevered_beta * erp
capm_coe
## [1] 0.112
Chapter 4 - Relative Valuation
Relative Valuation:
Valuation Multiples:
Analyzing Determinants of Multiples:
Example code includes:
load("./RInputFiles/midcap400.rda")
str(midcap400)
## 'data.frame': 400 obs. of 8 variables:
## $ ticker : Factor w/ 400 levels "AAN","ABMD","ACC",..: 97 17 1 2 4 6 15 5 9 11 ...
## $ company : Factor w/ 400 levels "3D Systems Corp",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ gics_sector : Factor w/ 11 levels "Consumer Discretionary",..: 7 6 1 5 7 7 7 6 6 5 ...
## $ gics_subindustry: Factor w/ 129 levels "Aerospace & Defense",..: 124 12 62 55 6 6 114 21 22 102 ...
## $ price : num 13.4 47.7 32 113.6 18.3 ...
## $ ltm_eps : num -0.346 1.869 1.925 1.122 1.102 ...
## $ ntm_eps : num 0.47 2 2.41 1.47 0.5 0.64 0 3 2.45 2.25 ...
## $ bvps : num 5.61 8.71 20.75 9.93 6.44 ...
# Review the first six rows of midcap400
head(midcap400)
## ticker company gics_sector
## 1 DDD 3D Systems Corp Information Technology
## 2 AOS A.O. Smith Corp Industrials
## 3 AAN Aaron's Inc Consumer Discretionary
## 4 ABMD Abiomed Health Care
## 5 ACIW ACI Worldwide Inc Information Technology
## 6 ACXM Acxiom Information Technology
## gics_subindustry price ltm_eps ntm_eps bvps
## 1 Technology Hardware, Storage & Peripherals 13.4 -0.346 0.47 5.61
## 2 Building Products 47.7 1.869 2.00 8.71
## 3 Homefurnishing Retail 32.0 1.925 2.41 20.75
## 4 Health Care Equipment 113.6 1.122 1.47 9.93
## 5 Application Software 18.3 1.102 0.50 6.44
## 6 Application Software 26.9 0.136 0.64 9.32
# Subset Pharmaceuticals firms
pharma <- subset(midcap400, gics_subindustry=="Pharmaceuticals")
pharma
## ticker company gics_sector gics_subindustry price
## 10 AKRX Akorn, Inc Health Care Pharmaceuticals 21.1
## 62 CTLT Catalent Inc Health Care Pharmaceuticals 26.7
## 126 ENDP Endo International Health Care Pharmaceuticals 15.7
## 289 PBH Prestige Brands Holdings Inc Health Care Pharmaceuticals 52.1
## ltm_eps ntm_eps bvps
## 10 1.500 2.25 6.54
## 62 0.727 1.43 4.71
## 126 -14.479 5.20 12.12
## 289 1.365 2.43 15.11
# Calculate P/LTM EPS
pharma$ltm_p_e <- ifelse(pharma$ltm_eps > 0, pharma$price / pharma$ltm_eps, NA)
# Calculate P/NTM EPS
pharma$ntm_p_e <- ifelse(pharma$ntm_eps > 0, pharma$price / pharma$ntm_eps, NA)
# Calculate P/BVPS
pharma$p_bv <- ifelse(pharma$bvps > 0, pharma$price / pharma$bvps, NA)
pharma
## ticker company gics_sector gics_subindustry price
## 10 AKRX Akorn, Inc Health Care Pharmaceuticals 21.1
## 62 CTLT Catalent Inc Health Care Pharmaceuticals 26.7
## 126 ENDP Endo International Health Care Pharmaceuticals 15.7
## 289 PBH Prestige Brands Holdings Inc Health Care Pharmaceuticals 52.1
## ltm_eps ntm_eps bvps ltm_p_e ntm_p_e p_bv
## 10 1.500 2.25 6.54 14.1 9.40 3.23
## 62 0.727 1.43 4.71 36.7 18.68 5.67
## 126 -14.479 5.20 12.12 NA 3.02 1.29
## 289 1.365 2.43 15.11 38.2 21.44 3.45
# Calculate average multiples
multiples <- colMeans(pharma[, c("ltm_p_e", "ntm_p_e", "p_bv")], na.rm=TRUE)
multiples
## ltm_p_e ntm_p_e p_bv
## 29.67 13.13 3.41
# Vector of metrics
metrics <- c(1, 2, 8)
# Calculate implied values
implied_val <- multiples*metrics
implied_val
## ltm_p_e ntm_p_e p_bv
## 29.7 26.3 27.3
# Subset Consumer Discretionary firms (the original comment said
# "Pharmaceuticals" -- copied from the previous example)
cons_disc <- subset(midcap400, gics_sector=="Consumer Discretionary")
cons_disc
## ticker company gics_sector
## 3 AAN Aaron's Inc Consumer Discretionary
## 17 AMCX AMC Networks Consumer Discretionary
## 19 AEO American Eagle Outfitters Consumer Discretionary
## 39 BIG Big Lots Inc Consumer Discretionary
## 45 EAT Brinker International Inc Consumer Discretionary
## 49 BC Brunswick Corp Consumer Discretionary
## 50 BWLD Buffalo Wild Wings Inc Consumer Discretionary
## 51 CAB Cabela's Inc Consumer Discretionary
## 52 CABO Cable One Inc Consumer Discretionary
## 55 CAA CalAtlantic Group Inc Consumer Discretionary
## 60 CRI Carter's, Inc. Consumer Discretionary
## 67 CAKE Cheesecake Factory Inc Consumer Discretionary
## 70 CHS Chico's Fas Inc Consumer Discretionary
## 71 CHDN Churchill Downs Inc Consumer Discretionary
## 73 CNK Cinemark Holdings Inc Consumer Discretionary
## 86 CTB Cooper Tire & Rubber Company Consumer Discretionary
## 92 CBRL Cracker Barrel Consumer Discretionary
## 95 CST CST Brands Inc Consumer Discretionary
## 100 DAN Dana Incorporated Consumer Discretionary
## 103 DECK Deckers Outdoor Corporation Consumer Discretionary
## 106 DV DeVry Education Group Consumer Discretionary
## 108 DKS Dick's Sporting Goods Inc Consumer Discretionary
## 110 DDS Dillard's Inc Consumer Discretionary
## 111 DPZ Domino's Pizza Inc Consumer Discretionary
## 118 DNKN Dunkin' Brands Group Inc Consumer Discretionary
## 144 FOSL Fossil Group Consumer Discretionary
## 147 GME GameStop Corp. Consumer Discretionary
## 151 GNTX Gentex Corp. Consumer Discretionary
## 156 GHC Graham Holdings Company Consumer Discretionary
## 168 HELE Helen of Troy Limited Consumer Discretionary
## 174 HSNI HSN Inc Consumer Discretionary
## 182 ISCA International Speedway Corporation Consumer Discretionary
## 186 JCP J C Penney Company Inc Consumer Discretionary
## 190 JACK Jack in the Box Inc. Consumer Discretionary
## 193 JW.A John Wiley & Sons Inc. 'A' Consumer Discretionary
## 196 KATE Kate Spade & Company Consumer Discretionary
## 197 KBH KB Home Consumer Discretionary
## 220 LYV Live Nation Entertainment Consumer Discretionary
## 234 MDP Meredith Corp Consumer Discretionary
## 243 MUSA Murphy USA Inc Consumer Discretionary
## 253 NYT New York Times Company Consumer Discretionary
## 261 NVR NVR Inc Consumer Discretionary
## 263 ODP Office Depot Inc Consumer Discretionary
## 277 PNRA Panera Bread Co A Consumer Discretionary
## 278 PZZA Papa John’s International Inc Consumer Discretionary
## 284 PII Polaris Industries Inc Consumer Discretionary
## 286 POOL Pool Corp Consumer Discretionary
## 307 SBH Sally Beauty Holdings Inc Consumer Discretionary
## 313 SCI Service Corp Intl Consumer Discretionary
## 317 SKX Skechers USA Inc Consumer Discretionary
## 322 BID Sotheby's Consumer Discretionary
## 340 TPX Tempur Sealy International Consumer Discretionary
## 344 TXRH Texas Roadhouse Consumer Discretionary
## 345 THO Thor Industries Inc Consumer Discretionary
## 346 TIME Time Inc Consumer Discretionary
## 348 TOL Toll Brothers Inc Consumer Discretionary
## 352 TPH TRI Pointe Group Consumer Discretionary
## 356 TUP Tupperware Corp Consumer Discretionary
## 375 VSTO Vista Outdoor Consumer Discretionary
## 386 WEN Wendy's Company Consumer Discretionary
## 394 WSM Williams-Sonoma Inc Consumer Discretionary
## gics_subindustry price ltm_eps ntm_eps bvps
## 3 Homefurnishing Retail 32.05 1.925 2.41 20.75
## 17 Cable & Satellite 52.80 3.770 5.97 -0.43
## 19 Apparel Retail 15.27 1.303 1.31 6.44
## 39 General Merchandise Stores 50.68 3.383 3.78 12.23
## 45 Restaurants 50.03 3.278 3.52 -10.69
## 49 Leisure Products 54.73 3.009 3.86 16.06
## 50 Restaurants 158.20 5.137 6.08 28.46
## 51 Specialty Stores 61.68 2.151 2.99 29.37
## 52 Cable & Satellite 623.11 17.226 20.07 79.56
## 55 Homebuilding 34.36 4.091 3.69 36.26
## 60 Apparel, Accessories & Luxury Goods 88.08 5.130 5.39 15.96
## 67 Restaurants 61.24 2.778 3.05 12.95
## 70 Apparel Retail 14.59 0.423 0.84 4.79
## 71 Casinos & Gaming 150.65 6.530 7.37 42.30
## 73 Movies & Entertainment 38.47 2.198 2.20 10.86
## 86 Tires & Rubber 38.80 4.559 3.96 20.12
## 92 Restaurants 168.35 8.214 8.45 22.76
## 95 Automotive Retail 48.23 4.284 1.79 16.24
## 100 Auto Parts & Equipment 18.97 4.384 1.75 8.04
## 103 Footwear 55.32 -0.071 4.25 30.25
## 106 Education Services 31.25 1.281 2.60 25.63
## 108 Specialty Stores 52.38 2.922 3.49 16.60
## 110 Department Stores 62.47 5.590 5.60 51.98
## 111 Restaurants 162.36 4.140 4.86 -40.16
## 118 Restaurants 53.23 2.136 2.39 -1.78
## 144 Apparel, Accessories & Luxury Goods 25.76 1.638 1.95 20.91
## 147 Computer & Electronics Retail 25.34 3.776 3.79 20.85
## 151 Auto Parts & Equipment 19.96 1.205 1.29 6.67
## 156 Education Services 524.20 29.956 NA 436.93
## 168 Household Appliances 85.35 4.125 6.28 35.15
## 174 Specialty Retail 34.20 2.267 2.52 3.75
## 182 Leisure Facilities 37.45 1.660 1.49 30.68
## 186 Multiline Retail 8.42 -1.047 0.51 3.70
## 190 Restaurants 112.55 3.743 4.70 -6.72
## 193 Publishing 54.30 1.552 3.04 17.06
## 196 Apparel, Accessories & Luxury Goods 18.49 1.184 0.76 3.36
## 197 Homebuilding 16.04 1.226 1.40 20.33
## 220 Movies & Entertainment 26.81 -0.233 0.24 5.53
## 234 Publishing 59.30 2.158 3.54 21.46
## 243 Automotive Retail 62.49 5.640 4.63 18.06
## 253 Publishing 13.40 0.195 0.56 5.26
## 261 Homebuilding 1680.00 110.544 112.17 344.18
## 263 Specialty Stores 4.62 1.260 0.46 3.51
## 277 Restaurants 207.93 6.209 7.36 12.34
## 278 Restaurants 87.28 2.764 2.71 -0.11
## 284 Leisure Products 81.37 3.312 4.82 13.53
## 286 Distributors 105.68 3.557 3.82 4.96
## 307 Specialty Stores 26.43 1.627 1.83 -2.02
## 313 Specialized Consumer Services 28.56 0.917 1.28 5.74
## 317 Footwear 24.65 1.579 1.75 10.16
## 322 Diversified Consumer Services 40.01 1.282 1.85 9.54
## 340 Household Durables 68.18 3.425 4.43 -0.26
## 344 Restaurants 49.03 1.642 1.94 10.64
## 345 Leisure Products 100.73 5.450 6.56 25.21
## 346 Publishing 18.00 -0.484 1.35 14.54
## 348 Homebuilding 31.38 2.271 3.08 26.04
## 352 Homebuilding 11.75 1.213 1.19 11.43
## 356 Housewares & Specialties 52.96 4.426 4.42 4.21
## 375 Leisure Products 37.08 3.012 2.84 28.94
## 386 Restaurants 13.71 0.710 0.42 2.76
## 394 Homefurnishing Retail 48.13 3.387 3.54 13.20
# Calculate ROE
cons_disc$roe <- cons_disc$ltm_eps / cons_disc$bvps
# Calculate Price to Book ratio
cons_disc$p_bv <- ifelse(cons_disc$bvps <= 0, NA, cons_disc$price / cons_disc$bvps)
# Remove NA
cons_disc_no_na <- cons_disc[complete.cases(cons_disc), ]
head(cons_disc_no_na)
## ticker company gics_sector
## 3 AAN Aaron's Inc Consumer Discretionary
## 19 AEO American Eagle Outfitters Consumer Discretionary
## 39 BIG Big Lots Inc Consumer Discretionary
## 49 BC Brunswick Corp Consumer Discretionary
## 50 BWLD Buffalo Wild Wings Inc Consumer Discretionary
## 51 CAB Cabela's Inc Consumer Discretionary
## gics_subindustry price ltm_eps ntm_eps bvps roe p_bv
## 3 Homefurnishing Retail 32.0 1.92 2.41 20.75 0.0928 1.54
## 19 Apparel Retail 15.3 1.30 1.31 6.44 0.2023 2.37
## 39 General Merchandise Stores 50.7 3.38 3.78 12.23 0.2766 4.14
## 49 Leisure Products 54.7 3.01 3.86 16.06 0.1874 3.41
## 50 Restaurants 158.2 5.14 6.08 28.46 0.1805 5.56
## 51 Specialty Stores 61.7 2.15 2.99 29.37 0.0732 2.10
# Set x-axis range
x.range <- c(min(cons_disc_no_na$roe), max(cons_disc_no_na$roe))
# Set y-axis range
y.range <- c(min(cons_disc_no_na$p_bv), max(cons_disc_no_na$p_bv))
# Plot data
plot(y = cons_disc_no_na$p_bv, x = cons_disc_no_na$roe,
xlab = "Return on Equity", ylab = "Price-to-Book",
xlim = x.range, ylim = y.range, col = "blue",
main = "Price-to-Book Value and Return on Equity Of Mid-Cap Consumer Discretionary Firms"
)
# Regress roe on p_bv
reg <- lm(p_bv ~ roe, data = cons_disc_no_na)
# Add trend line in red
abline(reg, col = "red")
# Regression summary
summary_reg <- summary(reg)
summary_reg
##
## Call:
## lm(formula = p_bv ~ roe, data = cons_disc_no_na)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.212 -1.280 -0.453 0.588 10.447
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.309 0.537 2.44 0.018 *
## roe 13.317 1.861 7.16 3.4e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.71 on 50 degrees of freedom
## Multiple R-squared: 0.506, Adjusted R-squared: 0.496
## F-statistic: 51.2 on 1 and 50 DF, p-value: 3.43e-09
# Store intercept. coef() returns a named vector, so `a` carries the
# "(Intercept)" name; unname(coef(reg)[1]) would drop it.
a <- coef(reg)[1]
a
## (Intercept)
## 1.31
# Store beta (slope on roe from the p_bv ~ roe regression above)
b <- coef(reg)[2]
b
## roe
## 13.3
# Calculate implied P/B by plugging an ROE of 0.2 into the fitted line.
# The "(Intercept)" label in the output below is the names attribute
# carried over from `a`, not a meaningful label.
implied_p_b <- (a + b*0.2)
implied_p_b
## (Intercept)
## 3.97
# Calculate implied price: implied P/B times book value per share
# (presumably BVPS = 8, matching the metrics vector used earlier -- verify)
implied_price <- 8 * implied_p_b
implied_price
## (Intercept)
## 31.8
Chapter 5 - Comprehensive Exercise
Fundamental Valuation: Analyzing Projections:
Fundamental Valuation: Implementation:
Relative Valuation:
Wrap up:
Example code includes:
# Revenue by year, 2010-2021, as a 2 x 12 matrix: one row of historical
# revenues (2010-2016) and one row of projected revenues (2017-2021),
# with zeros in the row that does not apply to a given year.
revenue <- matrix(data=c(81.87, 0, 87.82, 0, 85.95, 0, 89.02, 0, 97.43, 0, 96.78, 0, 96.7, 0, 0, 108.45, 0, 112.67, 0, 120, 0, 127.6, 0, 126.06),
ncol=12, byrow=FALSE, dimnames=list(c("hist_rev", "proj_rev"), 2010:2021)
)
str(revenue)
## num [1:2, 1:12] 81.9 0 87.8 0 86 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:2] "hist_rev" "proj_rev"
## ..$ : chr [1:12] "2010" "2011" "2012" "2013" ...
# Create a bar chart
barplot(revenue, col = c("red", "blue"), main = "Historical vs. Projected Revenues")
# Add legend
legend("topleft", legend = c("Historical", "Projected"), fill = c("red", "blue"))
# Collapse the two stacked series into a single annual series.
# colSums() is the idiomatic, faster equivalent of apply(revenue, 2, sum).
rev_all <- colSums(revenue)
rev_all_df <- data.frame(rev_all)
# Create Trend Variable. seq_along() is the safe idiom for 1:length();
# the latter yields c(1, 0) on zero-length input.
rev_all_df$trend <- seq_along(rev_all)
# Create Shift Variable: 0 for the 7 historical years, 1 for projections
rev_all_df$shift <- ifelse(rev_all_df$trend <= 7, 0, 1)
# Regress revenue on a linear trend plus a level-shift dummy
reg <- lm(rev_all ~ trend + shift, data = rev_all_df)
summary(reg)
##
## Call:
## lm(formula = rev_all ~ trend + shift, data = rev_all_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.978 -2.087 0.016 1.626 5.380
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 77.739 2.517 30.88 1.9e-10 ***
## trend 3.264 0.544 6.00 0.0002 ***
## shift 8.575 3.807 2.25 0.0508 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.35 on 9 degrees of freedom
## Multiple R-squared: 0.964, Adjusted R-squared: 0.956
## F-statistic: 121 on 2 and 9 DF, p-value: 3.14e-07
# Subset Treasury data to 12/30/16 (treas is loaded earlier in this document)
rf <- subset(treas, Date==as.Date("2016-12-30"))
# Keep 2nd column (the yield value)
rf_yield <- rf$Value
# Convert to decimal terms (yield is quoted in percent)
rf_yield_dec <- rf_yield/100
rf_yield_dec
## [1] 0.0245
# Calculate difference between S&P 500 Return and Treasury Return
# (fixed typo "Calcualte"; NOTE(review): this name shadows base::diff)
diff <- damodaran$sp_500 - damodaran$tbond_10yr
# Calculate average difference -- the historical equity risk premium
erp <- mean(diff)
erp
## [1] 0.0624
beta <- 1.4594
# Calculate CAPM Cost of Equity: risk-free rate + beta * ERP
ke <- rf_yield_dec + beta*erp
ke
## [1] 0.116
# Projected free cash flow to equity (FCFE) for 2017-2021, discounted at
# the cost of equity `ke` computed earlier in this document.
fcfe <- data.frame(fcfe=c(14, 15, 16, 17, 17.5))
rownames(fcfe) <- 2017:2021
fcfe
## fcfe
## 2017 14.0
## 2018 15.0
## 2019 16.0
## 2020 17.0
## 2021 17.5
# Calculate Discount Periods to 12/31/2016 (1 for 2017, ..., 5 for 2021)
fcfe$disc_periods <- seq_len(nrow(fcfe))
# Calculate discount factor 1 / (1 + ke)^t.
# Use the standard `^` operator; `**` is a deprecated synonym in R.
fcfe$disc_factor <- (1 + ke)^(-fcfe$disc_periods)
# Calculate PV of each period's total free cash flow
fcfe$pv <- fcfe$fcfe * fcfe$disc_factor
# Calculate Projection Period Value (sum of discounted projected FCFE)
pv_proj_period <- sum(fcfe$pv)
pv_proj_period
## [1] 57.2
# Perpetuity growth rate assumption
pgr <- 0.03
# Extract 2021 FCFE (last projection year)
fcfe_2021 <- fcfe[nrow(fcfe), "fcfe"]
# Use perpetuity with growth formula to calculate terminal value
tv_2021 <- fcfe_2021 * (1 + pgr) / (ke - pgr)
tv_2021
## [1] 211
# Calculate PV of Terminal Value, discounted over all projection periods
pv_terminal <- tv_2021 * (1 + ke)^(-nrow(fcfe))
pv_terminal
## [1] 122
# Calculate aggregate equity value (fixed typo "agggregate")
equity_value_fcfe <- pv_proj_period + pv_terminal
equity_value_fcfe
## [1] 179
# Shares outstanding
shout <- 15
# Calculate equity value per share
equity_value_fcfe_per_share <- equity_value_fcfe / shout
equity_value_fcfe_per_share
## [1] 12
## [1] 12
div <- 1.25
# Use DDM to Calculate Equity Value
equity_value_ddm <- div * (1 + pgr) / (ke - pgr)
equity_value_ddm
## [1] 15.1
# Equity Value Per Share
equity_value_ddm_per_share <- equity_value_ddm / shout
equity_value_ddm_per_share
## [1] 1
p_eps_multiple <- 8
eps <- 1.39
# Calculate Implied Equity Value
equity_value_p_e <- p_eps_multiple * eps
equity_value_p_e
## [1] 11.1
# Calculate Equity Value Per Share
equity_value_p_e_per_share <- equity_value_p_e / shout
equity_value_p_e_per_share
## [1] 0.741
eq_val_fcfe_per_share <- 11.95
eq_val_ddm_per_share <- 11.7
eq_val_p_e_per_share <- 11.12
# Combine equity values
# eq_val <- c(eq_val_fcfe, eq_val_ddm, eq_val_p_e)
# Combine equity values per share
eq_val_per_share <- c(eq_val_fcfe_per_share, eq_val_ddm_per_share, eq_val_p_e_per_share)
# Combine into a summary table
# summary <- rbind(eq_val, eq_val_per_share)
# Rename column headers
# colnames(summary) <- c("DCF", "DDM", "P/E")
# summary
Chapter 1 - Introduction and Data Pre-Processing
Introduction - course is primarily about risks of loan default:
Histograms and Outliers:
Missing data and course classification:
Data splitting and confusion matrices:
Example code includes:
loan_ch1 <- readRDS("./RInputFiles/loan_data_ch1.rds")
loan_ch2 <- readRDS("./RInputFiles/loan_data_ch2.rds")
str(loan_ch1)
## 'data.frame': 29092 obs. of 8 variables:
## $ loan_status : int 0 0 0 0 0 0 1 0 1 0 ...
## $ loan_amnt : int 5000 2400 10000 5000 3000 12000 9000 3000 10000 1000 ...
## $ int_rate : num 10.6 NA 13.5 NA NA ...
## $ grade : Factor w/ 7 levels "A","B","C","D",..: 2 3 3 1 5 2 3 2 2 4 ...
## $ emp_length : int 10 25 13 3 9 11 0 3 3 0 ...
## $ home_ownership: Factor w/ 4 levels "MORTGAGE","OTHER",..: 4 4 4 4 4 3 4 4 4 4 ...
## $ annual_inc : num 24000 12252 49200 36000 48000 ...
## $ age : int 33 31 24 39 24 28 22 22 28 22 ...
str(loan_ch2)
## 'data.frame': 29091 obs. of 8 variables:
## $ loan_status : int 0 0 0 0 0 0 1 0 1 0 ...
## $ loan_amnt : int 5000 2400 10000 5000 3000 12000 9000 3000 10000 1000 ...
## $ grade : Factor w/ 7 levels "A","B","C","D",..: 2 3 3 1 5 2 3 2 2 4 ...
## $ home_ownership: Factor w/ 4 levels "MORTGAGE","OTHER",..: 4 4 4 4 4 3 4 4 4 4 ...
## $ annual_inc : num 24000 12252 49200 36000 48000 ...
## $ age : int 33 31 24 39 24 28 22 22 28 22 ...
## $ emp_cat : Factor w/ 5 levels "0-15","15-30",..: 1 2 1 1 1 1 1 1 1 1 ...
## $ ir_cat : Factor w/ 5 levels "0-8","11-13.5",..: 4 5 2 5 5 2 2 4 4 3 ...
# View the structure of loan_data
loan_data <- loan_ch1
str(loan_data)
## 'data.frame': 29092 obs. of 8 variables:
## $ loan_status : int 0 0 0 0 0 0 1 0 1 0 ...
## $ loan_amnt : int 5000 2400 10000 5000 3000 12000 9000 3000 10000 1000 ...
## $ int_rate : num 10.6 NA 13.5 NA NA ...
## $ grade : Factor w/ 7 levels "A","B","C","D",..: 2 3 3 1 5 2 3 2 2 4 ...
## $ emp_length : int 10 25 13 3 9 11 0 3 3 0 ...
## $ home_ownership: Factor w/ 4 levels "MORTGAGE","OTHER",..: 4 4 4 4 4 3 4 4 4 4 ...
## $ annual_inc : num 24000 12252 49200 36000 48000 ...
## $ age : int 33 31 24 39 24 28 22 22 28 22 ...
# Call CrossTable() on loan_status
gmodels::CrossTable(loan_data$loan_status)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 29092
##
##
## | 0 | 1 |
## |-----------|-----------|
## | 25865 | 3227 |
## | 0.889 | 0.111 |
## |-----------|-----------|
##
##
##
##
# Call CrossTable() on grade and loan_status
gmodels::CrossTable(x=loan_data$grade, y=loan_data$loan_status,
prop.r=TRUE, prop.c=FALSE, prop.t=FALSE, prop.chisq=FALSE
)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## |-------------------------|
##
##
## Total Observations in Table: 29092
##
##
## | loan_data$loan_status
## loan_data$grade | 0 | 1 | Row Total |
## ----------------|-----------|-----------|-----------|
## A | 9084 | 565 | 9649 |
## | 0.941 | 0.059 | 0.332 |
## ----------------|-----------|-----------|-----------|
## B | 8344 | 985 | 9329 |
## | 0.894 | 0.106 | 0.321 |
## ----------------|-----------|-----------|-----------|
## C | 4904 | 844 | 5748 |
## | 0.853 | 0.147 | 0.198 |
## ----------------|-----------|-----------|-----------|
## D | 2651 | 580 | 3231 |
## | 0.820 | 0.180 | 0.111 |
## ----------------|-----------|-----------|-----------|
## E | 692 | 176 | 868 |
## | 0.797 | 0.203 | 0.030 |
## ----------------|-----------|-----------|-----------|
## F | 155 | 56 | 211 |
## | 0.735 | 0.265 | 0.007 |
## ----------------|-----------|-----------|-----------|
## G | 35 | 21 | 56 |
## | 0.625 | 0.375 | 0.002 |
## ----------------|-----------|-----------|-----------|
## Column Total | 25865 | 3227 | 29092 |
## ----------------|-----------|-----------|-----------|
##
##
# Create histogram of loan_amnt: hist_1
hist_1 <- hist(loan_data$loan_amnt)
# Print locations of the breaks in hist_1
hist_1$breaks
## [1] 0 2000 4000 6000 8000 10000 12000 14000 16000 18000 20000 22000
## [13] 24000 26000 28000 30000 32000 34000 36000
# Change number of breaks and add labels: hist_2
hist_2 <- hist(loan_data$loan_amnt, breaks = 200, xlab = "Loan amount",
main = "Histogram of the loan amount"
)
# Plot the age variable
plot(loan_data$age, ylab="Age")
# Save the outlier's index to index_highage
index_highage <- which(loan_data$age > 122)
# Create data set new_data with outlier deleted
new_data <- loan_data[-index_highage, ]
# Make bivariate scatterplot of age and annual income
plot(loan_data$age, loan_data$annual_inc, xlab = "Age", ylab = "Annual Income")
# Look at summary of loan_data
summary(loan_data$int_rate)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 5 8 11 11 13 23 2776
# Get indices of missing interest rates: na_index
na_index <- which(is.na(loan_data$int_rate))
# Remove observations with missing interest rates: loan_data_delrow_na
loan_data_delrow_na <- loan_data[-na_index, ]
# Make copy of loan_data
loan_data_delcol_na <- loan_data
# Delete interest rate column from loan_data_delcol_na
loan_data_delcol_na$int_rate <- NULL
# Compute the median of int_rate
median_ir <- median(loan_data$int_rate, na.rm=TRUE)
# Make copy of loan_data
loan_data_replace <- loan_data
# Replace missing interest rates with median
loan_data_replace$int_rate[na_index] <- median_ir
# Check if the NAs are gone
summary(loan_data_replace$int_rate)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.42 8.49 10.99 11.00 13.11 23.22
# Make the necessary replacements in the coarse classification example below.
# Coarse classification bins int_rate into a few categories so that missing
# values can be retained as their own explicit "Missing" level.
loan_data$ir_cat <- rep(NA, length(loan_data$int_rate))
loan_data$ir_cat[which(loan_data$int_rate <= 8)] <- "0-8"
loan_data$ir_cat[which(loan_data$int_rate > 8 & loan_data$int_rate <= 11)] <- "8-11"
loan_data$ir_cat[which(loan_data$int_rate > 11 & loan_data$int_rate <= 13.5)] <- "11-13.5"
loan_data$ir_cat[which(loan_data$int_rate > 13.5)] <- "13.5+"
# NA interest rates become the "Missing" category rather than being dropped
loan_data$ir_cat[which(is.na(loan_data$int_rate))] <- "Missing"
# as.factor() sorts levels alphabetically: "0-8", "11-13.5", "13.5+",
# "8-11", "Missing" (this matters for the glm dummy coding used later)
loan_data$ir_cat <- as.factor(loan_data$ir_cat)
# Look at your new variable using plot()
plot(loan_data$ir_cat)
# Set seed of 567 for a reproducible split
set.seed(567)
# Store row numbers for training set: index_train.
# seq_len() is the safe idiom for 1:nrow() (and draws identical samples);
# the non-integer size (2/3)*nrow(loan_data) is truncated by sample().
index_train <- sample(seq_len(nrow(loan_data)), (2/3)*nrow(loan_data))
# Create training set: training_set
training_set <- loan_data[index_train, ]
# Create test set: test_set (all rows not sampled into training)
test_set <- loan_data[-index_train, ]
# Create confusion matrix
# conf_matrix <- table(test_set$loan_status, model_pred)
# Compute classification accuracy
# sum(diag(conf_matrix)) / sum(conf_matrix)
# Compute sensitivity
# conf_matrix[2, 2] / sum(conf_matrix[2, ])
Chapter 2 - Logistic Regression
Logistic Regression Introduction:
Logistic Regression Predictions:
Evaluating regression results:
Wrap up:
Example code includes:
loan_data <- loan_ch2
str(loan_data)
## 'data.frame': 29091 obs. of 8 variables:
## $ loan_status : int 0 0 0 0 0 0 1 0 1 0 ...
## $ loan_amnt : int 5000 2400 10000 5000 3000 12000 9000 3000 10000 1000 ...
## $ grade : Factor w/ 7 levels "A","B","C","D",..: 2 3 3 1 5 2 3 2 2 4 ...
## $ home_ownership: Factor w/ 4 levels "MORTGAGE","OTHER",..: 4 4 4 4 4 3 4 4 4 4 ...
## $ annual_inc : num 24000 12252 49200 36000 48000 ...
## $ age : int 33 31 24 39 24 28 22 22 28 22 ...
## $ emp_cat : Factor w/ 5 levels "0-15","15-30",..: 1 2 1 1 1 1 1 1 1 1 ...
## $ ir_cat : Factor w/ 5 levels "0-8","11-13.5",..: 4 5 2 5 5 2 2 4 4 3 ...
# Reproducible 2/3 train / 1/3 test split of loan_data for Chapter 2
set.seed(1911031341)
# seq_len() is the safe idiom for 1:nrow(); sample draws are identical
idxTrain <- sample(seq_len(nrow(loan_data)), round((2/3)*nrow(loan_data)), replace=FALSE)
# Drop int_rate if present (loan_ch2 does not carry it, per str() above)
training_set <- loan_data[idxTrain, !(names(loan_data) %in% c("int_rate"))]
test_set <- loan_data[-idxTrain, !(names(loan_data) %in% c("int_rate"))]
str(training_set)
## 'data.frame': 19394 obs. of 8 variables:
## $ loan_status : int 0 0 0 0 0 0 0 0 0 0 ...
## $ loan_amnt : int 3250 6250 22000 2000 10500 15600 10000 2800 12000 7000 ...
## $ grade : Factor w/ 7 levels "A","B","C","D",..: 4 2 1 2 3 1 2 1 2 5 ...
## $ home_ownership: Factor w/ 4 levels "MORTGAGE","OTHER",..: 4 1 1 1 4 1 1 3 1 4 ...
## $ annual_inc : num 23004 32000 115000 126500 30600 ...
## $ age : int 49 25 28 25 25 26 26 24 23 25 ...
## $ emp_cat : Factor w/ 5 levels "0-15","15-30",..: 1 1 2 1 5 1 1 1 1 1 ...
## $ ir_cat : Factor w/ 5 levels "0-8","11-13.5",..: 3 2 1 5 2 1 2 4 4 3 ...
str(test_set)
## 'data.frame': 9697 obs. of 8 variables:
## $ loan_status : int 1 1 0 0 0 0 0 0 0 0 ...
## $ loan_amnt : int 9000 10000 10000 3600 5000 12400 6000 12000 3000 14000 ...
## $ grade : Factor w/ 7 levels "A","B","C","D",..: 3 2 3 1 1 2 2 1 4 2 ...
## $ home_ownership: Factor w/ 4 levels "MORTGAGE","OTHER",..: 4 4 4 1 4 4 4 4 1 4 ...
## $ annual_inc : num 30000 100000 42000 110000 24044 ...
## $ age : int 22 28 23 27 22 36 25 25 26 26 ...
## $ emp_cat : Factor w/ 5 levels "0-15","15-30",..: 1 1 1 1 1 2 1 1 1 1 ...
## $ ir_cat : Factor w/ 5 levels "0-8","11-13.5",..: 2 4 5 1 4 4 2 1 3 5 ...
# Build a glm model with variable ir_cat as a predictor
log_model_cat <- glm(loan_status ~ ir_cat, data=training_set, family="binomial")
# Print the parameter estimates
log_model_cat
##
## Call: glm(formula = loan_status ~ ir_cat, family = "binomial", data = training_set)
##
## Coefficients:
## (Intercept) ir_cat11-13.5 ir_cat13.5+ ir_cat8-11 ir_catMissing
## -2.909 1.002 1.379 0.604 0.771
##
## Degrees of Freedom: 19393 Total (i.e. Null); 19389 Residual
## Null Deviance: 13400
## Residual Deviance: 13000 AIC: 13000
# Look at the different categories in ir_cat using table()
table(loan_data$ir_cat)
##
## 0-8 11-13.5 13.5+ 8-11 Missing
## 7130 6953 6002 6230 2776
# Build the logistic regression model
log_model_multi <- glm(loan_status ~ age + ir_cat + grade + loan_amnt + annual_inc,
data=training_set, family="binomial"
)
# Obtain significance levels using summary()
summary(log_model_multi)
##
## Call:
## glm(formula = loan_status ~ age + ir_cat + grade + loan_amnt +
## annual_inc, family = "binomial", data = training_set)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.209 -0.532 -0.435 -0.331 3.355
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.35e+00 1.29e-01 -18.15 < 2e-16 ***
## age -6.39e-03 3.93e-03 -1.63 0.10356
## ir_cat11-13.5 5.43e-01 1.34e-01 4.06 4.9e-05 ***
## ir_cat13.5+ 4.89e-01 1.48e-01 3.30 0.00098 ***
## ir_cat8-11 3.58e-01 1.19e-01 2.99 0.00275 **
## ir_catMissing 3.54e-01 1.31e-01 2.70 0.00684 **
## gradeB 3.29e-01 1.08e-01 3.05 0.00227 **
## gradeC 6.02e-01 1.23e-01 4.90 9.5e-07 ***
## gradeD 9.89e-01 1.38e-01 7.15 8.9e-13 ***
## gradeE 1.20e+00 1.64e-01 7.28 3.4e-13 ***
## gradeF 1.38e+00 2.44e-01 5.65 1.6e-08 ***
## gradeG 2.26e+00 3.65e-01 6.20 5.8e-10 ***
## loan_amnt -5.52e-06 4.20e-06 -1.31 0.18911
## annual_inc -5.49e-06 7.54e-07 -7.28 3.5e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 13422 on 19393 degrees of freedom
## Residual deviance: 12845 on 19380 degrees of freedom
## AIC: 12873
##
## Number of Fisher Scoring iterations: 5
# Build the logistic regression model
log_model_small <- glm(formula = loan_status ~ age + ir_cat, family = "binomial", data = training_set)
predictions_all_small <- predict(log_model_small, newdata = test_set, type = "response")
# Look at the range of the object "predictions_all_small"
range(predictions_all_small)
## [1] 0.029 0.190
# Change the code below to construct a logistic regression model using all available predictors in the data set
log_model_full <- glm(loan_status ~ ., family = "binomial", data = training_set)
# Make PD-predictions for all test set elements using the the full logistic regression model
predictions_all_full <- predict(log_model_full, newdata=test_set, type="response")
# Look at the predictions range
range(predictions_all_full)
## [1] 2.77e-06 5.09e-01
# Make a binary predictions-vector using a cut-off of 15%
pred_cutoff_15 <- ifelse(predictions_all_full > .15, 1, 0)
# Construct a confusion matrix
table(test_set$loan_status, pred_cutoff_15)
## pred_cutoff_15
## 0 1
## 0 6914 1685
## 1 721 377
# Fit the logit, probit and cloglog-link logistic regression models
log_model_logit <- glm(loan_status ~ age + emp_cat + ir_cat + loan_amnt,
family = binomial(link = logit), data = training_set
)
log_model_probit <- glm(loan_status ~ age + emp_cat + ir_cat + loan_amnt,
family = binomial(link = probit), data = training_set
)
log_model_cloglog <- glm(loan_status ~ age + emp_cat + ir_cat + loan_amnt,
family = binomial(link = cloglog), data = training_set
)
# Make predictions for all models using the test set
predictions_logit <- predict(log_model_logit, newdata = test_set, type = "response")
predictions_probit <- predict(log_model_probit, newdata = test_set, type = "response")
predictions_cloglog <- predict(log_model_cloglog, newdata = test_set, type = "response")
# Use a cut-off of 14% to make binary predictions-vectors
cutoff <- 0.14
class_pred_logit <- ifelse(predictions_logit > cutoff, 1, 0)
class_pred_probit <- ifelse(predictions_probit > cutoff, 1, 0)
class_pred_cloglog <- ifelse(predictions_cloglog > cutoff, 1, 0)
# Make a confusion matrix for the three models
true_val <- test_set$loan_status
tab_class_logit <- table(true_val,class_pred_logit)
tab_class_probit <- table(true_val,class_pred_probit)
tab_class_cloglog <- table(true_val,class_pred_cloglog)
# Compute the classification accuracy for all three models
acc_logit <- sum(diag(tab_class_logit)) / nrow(test_set)
acc_probit <- sum(diag(tab_class_probit)) / nrow(test_set)
acc_cloglog <- sum(diag(tab_class_cloglog)) / nrow(test_set)
Chapter 3 - Decision Trees
Background - decision trees tend to be popular due to the ease of interpretation:
Building decision trees using rpart:
Pruning decision trees:
Other tree options and confusion matrices:
Example code includes:
# As a small reminder, remember that Gini of a certain node = 2 * proportion of defaults in this node * proportion of non-defaults in this node. Have a look at the code for a refresher.
# gini_root <- 2 * (89 / 500) * (411 / 500)
# Look at the following code to get an idea of how you can use the gini measures you created to calculate the gain of a node.
# Gain = gini_root - (prop(cases left leaf) * gini_left) - (prop(cases right leaf * gini_right))
ssStatus <- c(0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0)
ssAge <- c(48, 36, 35, 33, 46, 24, 35, 33, 41, 49, 47, 44, 54, 42, 26, 50, 41, 30, 17, 34, 37, 41, 41, 33, 39, 40, 40, 55, 45, 45, 45, 49, 47, 29, 42, 42, 60, 40, 52, 50, 45, 39, 38, 41, 37, 43, 49, 54, 50, 55, 18, 44, 43, 36, 49, 30, 51, 38, 42, 38, 47, 39, 57, 56, 33, 51, 45, 53, 48, 37, 40, 51, 23, 22, 52, 57, 41, 33, 42, 43, 45, 26, 57, 40, 37, 40, 54, 40, 43, 45, 43, 45, 34, 57, 53, 46, 52, 35, 27, 39, 47, 23, 50, 43, 40, 56, 40, 44, 44, 42, 41, 43, 31, 35, 48, 50, 43, 29, 36, 35, 53, 37, 51, 39, 43, 60, 51, 28, 25, 50, 46, 37, 55, 43, 28, 49, 29, 53, 48, 51, 33, 35, 41, 27, 39, 47, 33, 53, 42, 56, 46, 46, 43, 52, 48, 33, 47, 55, 36, 56, 33, 44, 29, 47, 43, 54, 47, 54, 29, 22, 32, 34, 35, 36, 52, 32, 39, 57, 47, 40, 29, 51, 54, 48, 34, 35, 51, 38, 38, 31, 47, 41, 60, 42, 43, 49, 41, 35, 44, 49, 36, 46, 45, 43, 43, 43, 48, 37, 40, 48, 48, 48, 54, 37, 39, 41, 49, 47, 42, 42, 56, 49, 48, 51, 16, 43, 38, 48, 54, 47, 39, 51, 44, 46, 48, 37, 41, 33, 45, 43, 31, 44, 36, 44, 36, 12, 48, 40, 47, 38, 36, 52, 41, 36, 33, 45, 58, 44, 54, 37, 56, 29, 34, 44, 46, 46, 47, 40, 52, 36, 44, 49, 61, 55, 28, 39, 45, 40, 57, 41, 53, 39, 45, 33, 43, 38, 59, 42, 22, 47, 40, 44, 31, 13, 58, 43, 53, 45, 45, 51, 41, 51, 47, 45, 12, 46, 55, 31, 57, 41, 22, 33, 56, 36, 42, 38, 41, 49, 46, 41, 39, 51, 45, 48, 44, 47, 43, 24, 36, 51, 45, 40, 32, 54, 41, 37, 41, 40, 38, 28, 33, 47, 42, 45, 48, 52, 40, 45, 40, 30, 44, 49, 50, 32, 46, 40, 54, 50, 44, 48, 26, 41, 43, 47, 52, 47, 48, 53, 42, 50, 47, 43, 36, 41, 50, 41, 41, 35, 46, 45, 59, 15, 37, 41, 47, 56, 36, 41, 21, 27, 25, 53, 48, 50, 47, 53, 49, 57, 40, 43, 47, 52, 52, 44, 52, 44, 37, 61, 42, 37, 41, 50, 31, 40, 50, 43, 39, 44, 49, 49, 43, 54, 41, 43, 44, 47, 33, 47, 52, 66, 54, 53, 57, 38, 44, 45, 23, 52, 38, 36, 49, 38, 53, 45, 60, 45, 46, 41, 23, 41, 38, 26, 46, 37, 42, 42, 36, 46, 55, 50, 47, 41, 54, 44, 49, 35, 38, 43, 33, 39, 37, 41, 45, 38, 45, 25, 33, 46, 43, 38, 43, 45, 34, 42, 37, 55, 44, 50, 33, 40, 61, 18, 46, 41, 43, 38, 21, 
35, 45, 55)
# Build a tiny 500-row data set and fit a depth-1 classification stump to
# reproduce rpart's Gini "improve" statistic by hand.
small_set <- data.frame(status=ssStatus, age=ssAge)
str(small_set)
## 'data.frame': 500 obs. of 2 variables:
## $ status: num 0 0 0 0 0 1 0 1 0 0 ...
## $ age : num 48 36 35 33 46 24 35 33 41 49 ...
small_tree <- rpart::rpart(formula = status ~ age, data = small_set, method = "class",
control = rpart::rpart.control(minsplit = 5, cp = 0.001, maxdepth = 1)
)
# The Gini-measure of the root node is given below
# (89 vs 411 are the class counts in the root; the leaf counts below are
# presumably read off summary(small_tree) -- confirm against the fitted tree)
gini_root <- 2 * 89 / 500 * 411 / 500
# Compute the Gini measure for the left leaf node
gini_ll <- 2 * (401/446) * (45/446)
# Compute the Gini measure for the right leaf node
gini_rl <- 2 * (10/54) * (44/54)
# Compute the gain: root impurity minus the size-weighted leaf impurities
gain <- gini_root - 446 / 500 * gini_ll - 54 / 500 * gini_rl
# compare the gain-column in small_tree$splits with our computed gain, multiplied by 500, and assure they are the same
small_tree$splits
## count ncat improve index adj
## age 500 1 49.1 32.5 0
improve <- gain * 500
# Undersampling: keep every default and a random sample of twice as many
# non-defaults, so the minority class makes up one third of the training data.
set.seed(1911041357)
usDef <- training_set %>%
filter(loan_status==1)
usOK <- training_set %>%
filter(loan_status==0) %>%
sample_n(size=2*nrow(usDef))
undersampled_training_set <- rbind(usDef, usOK)
str(undersampled_training_set)
## 'data.frame': 6387 obs. of 8 variables:
## $ loan_status : int 1 1 1 1 1 1 1 1 1 1 ...
## $ loan_amnt : int 3000 3000 13800 2400 5000 20000 8000 2500 12000 11225 ...
## $ grade : Factor w/ 7 levels "A","B","C","D",..: 2 2 3 4 1 2 2 2 4 4 ...
## $ home_ownership: Factor w/ 4 levels "MORTGAGE","OTHER",..: 4 4 4 4 4 4 1 4 1 1 ...
## $ annual_inc : num 34580 41000 33036 30000 60000 ...
## $ age : int 31 28 33 21 37 45 22 22 31 22 ...
## $ emp_cat : Factor w/ 5 levels "0-15","15-30",..: 1 1 1 1 2 1 1 1 1 1 ...
## $ ir_cat : Factor w/ 5 levels "0-8","11-13.5",..: 2 2 2 3 1 2 4 4 3 5 ...
# Change the code provided in the video such that a decision tree is constructed using the undersampled training set. Include rpart.control to relax the complexity parameter to 0.001.
tree_undersample <- rpart::rpart(loan_status ~ ., method = "class", data = undersampled_training_set,
control = rpart::rpart.control(cp = 0.001)
)
# Plot the decision tree
plot(tree_undersample, uniform=TRUE)
# Add labels to the decision tree
text(tree_undersample)
# Change the code below such that a tree is constructed with adjusted prior probabilities.
# prior = c(0.7, 0.3): presumably (non-default, default) in factor-level order,
# overweighting the minority class relative to its ~11% sample frequency -- confirm
tree_prior <- rpart::rpart(loan_status ~ ., method = "class", data = training_set,
parms = list(prior=c(0.7, 0.3)), control = rpart::rpart.control(cp = 0.001)
)
# Plot the decision tree
plot(tree_prior, uniform=TRUE)
# Add labels to the decision tree
text(tree_prior)
# Change the code below such that a decision tree is constructed using a loss matrix penalizing 10 times more heavily for misclassified defaults.
# matrix(c(0, 10, 1, 0), ncol=2) is filled column-wise: a missed default costs 10,
# a false alarm costs 1
tree_loss_matrix <- rpart::rpart(loan_status ~ ., method = "class", data = training_set,
parms = list(loss = matrix(c(0, 10, 1, 0), ncol=2)),
control = rpart::rpart.control(cp = 0.001)
)
# Plot the decision tree
plot(tree_loss_matrix, uniform=TRUE)
# Add labels to the decision tree
text(tree_loss_matrix)
# Plot the cross-validated error rate as a function of the complexity parameter
rpart::plotcp(tree_prior)
# Use printcp() to identify for which complexity parameter the cross-validated error rate is minimized.
rpart::printcp(tree_prior)
##
## Classification tree:
## rpart::rpart(formula = loan_status ~ ., data = training_set,
## method = "class", parms = list(prior = c(0.7, 0.3)), control = rpart::rpart.control(cp = 0.001))
##
## Variables actually used in tree construction:
## [1] age annual_inc emp_cat grade home_ownership
## [6] ir_cat loan_amnt
##
## Root node error: 5818/19394 = 0.3
##
## n= 19394
##
## CP nsplit rel error xerror xstd
## 1 0.004 0 1.0 1 0.02
## 2 0.004 5 1.0 1 0.02
## 3 0.004 6 1.0 1 0.02
## 4 0.002 7 1.0 1 0.02
## 5 0.002 11 1.0 1 0.02
## 6 0.002 12 1.0 1 0.02
## 7 0.001 17 0.9 1 0.02
## 8 0.001 18 0.9 1 0.02
## 9 0.001 21 0.9 1 0.02
## 10 0.001 26 0.9 1 0.02
## 11 0.001 27 0.9 1 0.02
## 12 0.001 29 0.9 1 0.02
## 13 0.001 36 0.9 1 0.02
## 14 0.001 38 0.9 1 0.02
## 15 0.001 41 0.9 1 0.02
## 16 0.001 43 0.9 1 0.02
## 17 0.001 46 0.9 1 0.02
## 18 0.001 48 0.9 1 0.02
## 19 0.001 50 0.9 1 0.02
## 20 0.001 52 0.9 1 0.02
# Create an index of the row with the minimum xerror in the cp table
index <- which.min(tree_prior$cptable[ , "xerror"])
# Create tree_min: the CP value at that row, used as the pruning threshold
tree_min <- tree_prior$cptable[index, "CP"]
# Prune the tree using tree_min
ptree_prior <- rpart::prune(tree_prior, cp = tree_min)
# Use prp() to plot the pruned tree
rpart.plot::prp(ptree_prior)
# set a seed and run the code to construct the tree with the loss matrix again
# (the seed makes the 10-fold cross-validation in rpart reproducible)
set.seed(345)
tree_loss_matrix <- rpart::rpart(loan_status ~ ., method = "class", data = training_set,
parms = list(loss=matrix(c(0, 10, 1, 0), ncol = 2)),
control = rpart::rpart.control(cp = 0.001)
)
# Plot the cross-validated error rate as a function of the complexity parameter
rpart::plotcp(tree_loss_matrix)
# Prune the tree using cp = 0.0012788
ptree_loss_matrix <- rpart::prune(tree_loss_matrix, cp = 0.0012788)
# Use prp() and argument extra = 1 to plot the pruned tree
rpart.plot::prp(ptree_loss_matrix, extra=1)
# set a seed and run the code to obtain a tree using weights, minsplit and minbucket
set.seed(345)
# Give each default three times the weight of a non-default
case_weights <- ifelse(training_set$loan_status==0, 1, 3)
tree_weights <- rpart::rpart(loan_status ~ ., method = "class",
data = training_set, weights = case_weights,
control = rpart::rpart.control(minsplit = 5, minbucket = 2, cp = 0.001)
)
# Plot the cross-validated error rate for a changing cp
rpart::plotcp(tree_weights)
# Create an index of the row with the minimum xerror
# index <- which.min(tree_weights$cp[ , "xerror"])
# The pipe below skips row 1 of the cp table before taking which.min, then adds
# 1 to map back to the full table -- presumably because the unsplit root would
# otherwise win; confirm against plotcp(tree_weights)
index <- 1 + tree_weights$cp %>%
as.data.frame() %>%
slice(2:nrow(.)) %>%
pull(xerror) %>%
which.min()
# Create tree_min: the CP value to prune at
tree_min <- tree_weights$cp[index, "CP"]
# Prune the tree using tree_min
ptree_weights <- rpart::prune(tree_weights, cp=tree_min)
# Plot the pruned tree using the rpart.plot()-package
rpart.plot::prp(ptree_weights, extra=1)
# Refit the undersampled tree (unpruned, cp = 0.001) so all four variants can
# be compared on the same test set.
ptree_undersample <- rpart::rpart(formula = loan_status ~ ., data = undersampled_training_set,
method = "class", control = rpart::rpart.control(cp = 0.001)
)
# Make predictions for each of the pruned trees using the test set.
pred_undersample <- predict(ptree_undersample, newdata = test_set, type = "class")
pred_prior <- predict(ptree_prior, newdata = test_set, type = "class")
pred_loss_matrix <- predict(ptree_loss_matrix, newdata = test_set, type = "class")
pred_weights <- predict(ptree_weights, newdata = test_set, type = "class")
# construct confusion matrices using the predictions (rows = actual, cols = predicted).
confmat_undersample <- table(test_set$loan_status, pred_undersample)
confmat_prior <- table(test_set$loan_status, pred_prior)
confmat_loss_matrix <- table(test_set$loan_status, pred_loss_matrix)
confmat_weights <- table(test_set$loan_status, pred_weights)
# Compute the accuracies: correct classifications on the diagonal over total rows
(acc_undersample <- sum(diag(confmat_undersample)) / nrow(test_set))
## [1] 0.797
(acc_prior <- sum(diag(confmat_prior)) / nrow(test_set))
## [1] 0.864
(acc_loss_matrix <- sum(diag(confmat_loss_matrix)) / nrow(test_set))
## [1] 0.472
(acc_weights <- sum(diag(confmat_weights)) / nrow(test_set))
## [1] 0.871
Chapter 4 - Evaluating Credit Risk Models
Right cut-offs (strategy curves):
ROC curves:
Input selection based on AUC:
Wrap up:
Example code includes:
# Make predictions for the probability of default using the pruned tree and the test set.
# Column 2 of the prediction matrix holds P(default).
prob_default_prior <- predict(ptree_prior, newdata = test_set)[ ,2]
# Obtain the cutoff for acceptance rate 80%: the 80th percentile of predicted
# PDs, so the 80% of loans with the lowest PD are accepted
cutoff_prior <- quantile(prob_default_prior, 0.8)
# Obtain the binary predictions.
bin_pred_prior_80 <- ifelse(prob_default_prior > cutoff_prior, 1, 0)
# Obtain the actual default status for the accepted loans
accepted_status_prior_80 <- test_set$loan_status[bin_pred_prior_80 == 0]
# Obtain the bad rate for the accepted loans (mean of a 0/1 vector = share of defaults)
mean(accepted_status_prior_80)
## [1] 0.105
# Have a look at the function strategy_bank
# Compute, for each acceptance rate from 100% down to 0% in 5% steps, the PD
# cutoff implied by the predicted probabilities and the resulting bad rate
# among the accepted loans.
#
# prob_of_def: numeric vector of predicted probabilities of default.
# actual_status: 0/1 vector of observed default status for the same loans.
# Defaults to test_set$loan_status, preserving the original behaviour
# (the original read the global test_set directly inside the loop).
#
# Returns a list with the strategy table plus the bad_rate, accept_rate and
# cutoff vectors.
strategy_bank <- function(prob_of_def, actual_status = test_set$loan_status) {
accept_rate <- seq(1, 0, by = -0.05)
# Derive the number of steps instead of hard-coding 21
n_steps <- length(accept_rate)
cutoff <- rep(NA, n_steps)
bad_rate <- rep(NA, n_steps)
for (i in seq_len(n_steps)) {
# Accepting a fraction p of loans means cutting at the p-th quantile of PD
cutoff[i] <- quantile(prob_of_def, accept_rate[i])
pred_i <- ifelse(prob_of_def > cutoff[i], 1, 0)
# Bad rate = share of actual defaults among loans predicted as good
pred_as_good <- actual_status[pred_i == 0]
bad_rate[i] <- sum(pred_as_good) / length(pred_as_good)
}
table <- cbind(accept_rate, cutoff=round(cutoff,4), bad_rate=round(bad_rate,4))
return(list(table=table, bad_rate=bad_rate, accept_rate=accept_rate, cutoff=cutoff))
}
# PD predictions from the loss-matrix tree (column 2 = P(default))
predictions_loss_matrix <- predict(ptree_loss_matrix, newdata=test_set, type = "prob")[, 2]
# Apply the function strategy_bank to both predictions_cloglog and predictions_loss_matrix
strategy_cloglog <- strategy_bank(predictions_cloglog)
strategy_loss_matrix <- strategy_bank(predictions_loss_matrix)
# Obtain the strategy tables for both prediction-vectors
strategy_cloglog$table
## accept_rate cutoff bad_rate
## [1,] 1.00 0.4294 0.1132
## [2,] 0.95 0.1901 0.1086
## [3,] 0.90 0.1802 0.1052
## [4,] 0.85 0.1688 0.1013
## [5,] 0.80 0.1508 0.0977
## [6,] 0.75 0.1384 0.0944
## [7,] 0.70 0.1325 0.0896
## [8,] 0.65 0.1265 0.0867
## [9,] 0.60 0.1188 0.0823
## [10,] 0.55 0.1105 0.0795
## [11,] 0.50 0.1036 0.0775
## [12,] 0.45 0.0978 0.0738
## [13,] 0.40 0.0940 0.0691
## [14,] 0.35 0.0902 0.0636
## [15,] 0.30 0.0855 0.0615
## [16,] 0.25 0.0756 0.0548
## [17,] 0.20 0.0539 0.0484
## [18,] 0.15 0.0516 0.0460
## [19,] 0.10 0.0494 0.0502
## [20,] 0.05 0.0465 0.0489
## [21,] 0.00 0.0216 0.0000
# The tree produces only a few distinct PD values, hence the repeated
# cutoff/bad_rate rows below
strategy_loss_matrix$table
## accept_rate cutoff bad_rate
## [1,] 1.00 0.2414 0.1132
## [2,] 0.95 0.1663 0.1124
## [3,] 0.90 0.1663 0.1124
## [4,] 0.85 0.1663 0.1124
## [5,] 0.80 0.1663 0.1124
## [6,] 0.75 0.1663 0.1124
## [7,] 0.70 0.1663 0.1124
## [8,] 0.65 0.1663 0.1124
## [9,] 0.60 0.1343 0.0834
## [10,] 0.55 0.1236 0.0727
## [11,] 0.50 0.1141 0.0718
## [12,] 0.45 0.1141 0.0718
## [13,] 0.40 0.0717 0.0671
## [14,] 0.35 0.0717 0.0671
## [15,] 0.30 0.0711 0.0611
## [16,] 0.25 0.0615 0.0576
## [17,] 0.20 0.0615 0.0576
## [18,] 0.15 0.0345 0.0474
## [19,] 0.10 0.0345 0.0474
## [20,] 0.05 0.0345 0.0474
## [21,] 0.00 0.0000 0.2222
# Plot the two strategy curves (bad rate as a function of acceptance rate)
# side by side, then restore the single-panel layout.
par(mfrow = c(1,2))
for (spec in list(list(res = strategy_cloglog, title = "logistic regression"),
list(res = strategy_loss_matrix, title = "tree"))) {
plot(spec$res$accept_rate, spec$res$bad_rate,
type = "l", xlab = "Acceptance rate", ylab = "Bad rate",
lwd = 2, main = spec$title, ylim = c(0, 0.15)
)
}
par(mfrow = c(1,1))
# pROC provides roc(), plot.roc() and auc() for ROC analysis
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following object is masked from 'package:BiocGenerics':
##
## var
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# Construct the objects containing ROC-information, one per link function
ROC_logit <- roc(test_set$loan_status, predictions_logit)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
ROC_probit <- roc(test_set$loan_status, predictions_probit)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
ROC_cloglog <- roc(test_set$loan_status, predictions_cloglog)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
ROC_all_full <- roc(test_set$loan_status, predictions_all_full)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Draw all ROCs on one plot
plot.roc(ROC_logit)
lines(ROC_probit, col="blue")
lines(ROC_cloglog, col="red")
lines(ROC_all_full, col="green")
# Compute the AUCs; the full model (all predictors) performs best here
pROC::auc(ROC_logit)
## Area under the curve: 0.626
pROC::auc(ROC_probit)
## Area under the curve: 0.626
pROC::auc(ROC_cloglog)
## Area under the curve: 0.626
pROC::auc(ROC_all_full)
## Area under the curve: 0.644
# PD predictions for each tree variant (column 2 = P(default))
predictions_undersample <- predict(ptree_undersample, newdata=test_set, type = "prob")[, 2]
predictions_prior <- predict(ptree_prior, newdata=test_set, type = "prob")[, 2]
predictions_loss_matrix <- predict(ptree_loss_matrix, newdata=test_set, type = "prob")[, 2]
predictions_weights <- predict(ptree_weights, newdata=test_set, type = "prob")[, 2]
# Construct the objects containing ROC-information
ROC_undersample <- roc(test_set$loan_status, predictions_undersample)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
ROC_prior <- roc(test_set$loan_status, predictions_prior)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
ROC_loss_matrix <- roc(test_set$loan_status, predictions_loss_matrix)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
ROC_weights <- roc(test_set$loan_status, predictions_weights)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Draw the ROC-curves in one plot
plot.roc(ROC_undersample)
lines(ROC_prior, col="blue")
lines(ROC_loss_matrix, col="red")
lines(ROC_weights, col="green")
# Compute the AUCs; the loss-matrix tree has the best AUC of the four
auc(ROC_undersample)
## Area under the curve: 0.616
auc(ROC_prior)
## Area under the curve: 0.596
auc(ROC_loss_matrix)
## Area under the curve: 0.627
auc(ROC_weights)
## Area under the curve: 0.596
# Backward input selection by AUC: repeatedly drop the variable whose removal
# hurts the test-set AUC least, and stop when every removal degrades it.
# Build four models each time deleting one variable in log_3_remove_ir
log_4_remove_amnt <- glm(loan_status ~ grade + annual_inc + emp_cat,
family = binomial, data = training_set
)
log_4_remove_grade <- glm(loan_status ~ loan_amnt + annual_inc + emp_cat,
family = binomial, data = training_set
)
log_4_remove_inc <- glm(loan_status ~ loan_amnt + grade + emp_cat,
family = binomial, data = training_set
)
log_4_remove_emp <- glm(loan_status ~ loan_amnt + grade + annual_inc,
family = binomial, data = training_set
)
# Make PD-predictions for each of the models
pred_4_remove_amnt <- predict(log_4_remove_amnt, newdata = test_set, type = "response")
pred_4_remove_grade <- predict(log_4_remove_grade, newdata = test_set, type = "response")
pred_4_remove_inc <- predict(log_4_remove_inc, newdata = test_set, type = "response")
pred_4_remove_emp <- predict(log_4_remove_emp, newdata = test_set, type = "response")
# Compute the AUCs: dropping loan_amnt hurts least (0.641)
auc(test_set$loan_status, pred_4_remove_amnt)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Area under the curve: 0.641
auc(test_set$loan_status, pred_4_remove_grade)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Area under the curve: 0.585
auc(test_set$loan_status, pred_4_remove_inc)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Area under the curve: 0.628
auc(test_set$loan_status, pred_4_remove_emp)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Area under the curve: 0.64
# Build three models each time deleting one variable in log_4_remove_amnt
log_5_remove_grade <- glm(loan_status ~ annual_inc + emp_cat, family = binomial, data = training_set)
log_5_remove_inc <- glm(loan_status ~ grade + emp_cat, family = binomial, data = training_set)
log_5_remove_emp <- glm(loan_status ~ grade + annual_inc, family = binomial, data = training_set)
# Make PD-predictions for each of the models
pred_5_remove_grade <- predict(log_5_remove_grade, newdata = test_set, type = "response")
pred_5_remove_inc <- predict(log_5_remove_inc, newdata = test_set, type = "response")
pred_5_remove_emp <- predict(log_5_remove_emp, newdata = test_set, type = "response")
# Compute the AUCs: every further removal lowers AUC below 0.641, so the
# three-variable model from the previous round is kept
auc(test_set$loan_status, pred_5_remove_grade)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Area under the curve: 0.581
auc(test_set$loan_status, pred_5_remove_inc)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Area under the curve: 0.623
auc(test_set$loan_status, pred_5_remove_emp)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Area under the curve: 0.641
# Plot the ROC-curve for the best model here
plot.roc(roc(test_set$loan_status, pred_4_remove_amnt))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
Chapter 1 - Trading Basics
Rationales for trading:
Pitfalls of trading systems:
Getting financial data:
Adding indicators to financial data:
Example code includes:
# Get SPY from yahoo; adjust = TRUE adjusts prices for splits and dividends,
# auto.assign = TRUE creates the SPY object in the calling environment
quantmod::getSymbols("SPY", from = "2000-01-01", to = "2016-06-30",
src = "yahoo", adjust = TRUE, auto.assign=TRUE
)
# Plot the closing price of SPY (Cl() extracts the Close column)
plot(quantmod::Cl(SPY))
# Add a 200-day SMA using lines()
lines(TTR::SMA(quantmod::Cl(SPY), n = 200), col = "red")
Chapter 2 - Boilerplate quantstrat strategies
Setting up a strategy - I:
Setting up a strategy - II:
Example code includes:
# DO NOT RUN - the 'quantstrat' package is not installed in this environment
# Load the quantstrat package (not available on CRAN; installed from GitHub/R-Forge)
library(quantstrat)
# Create initdate, from, and to strings; initdate must predate the data range
initdate <- "1999-01-01"
from <- "2003-01-01"
to <- "2015-12-31"
# Set the timezone to UTC (do not want to mess up my machine with this . . . )
# Sys.setenv(TZ="UTC")
# Set the currency to USD
currency("USD")
# Load the quantmod package
library(quantmod)
# Retrieve SPY from yahoo
getSymbols("SPY", src="yahoo", from=from, to=to, adjust=TRUE)
# Use stock() to initialize SPY and set currency to USD
stock("SPY", currency="USD")
# Define your trade size and initial equity
tradesize <- 100000L
initeq <- 100000L
# Define the names of your strategy, portfolio and account
strategy.st <- "firststrat"
portfolio.st <- "firststrat"
account.st <- "firststrat"
# Remove the existing strategy if it exists (avoids stale state on re-run)
rm.strat(strategy.st)
# Initialize the portfolio
initPortf(portfolio.st, symbols = "SPY", initDate = initdate, currency = "USD")
# Initialize the account
initAcct(account.st, portfolios = portfolio.st, initDate = initdate, currency = "USD", initEq = initeq)
# Initialize the orders
initOrders(portfolio.st, initDate = initdate)
# Store the strategy
strategy(strategy.st, store = TRUE)
Chapter 3 - Indicators
Introduction to indicators:
Indicator mechanic:
Indicator structure review:
Example code includes:
# DO NOT RUN - the 'quantstrat' package is not installed in this environment
# Create a 200-day SMA
spy_sma <- TTR::SMA(x=quantmod::Cl(SPY), n=200)
# Create an RSI with a 3-day lookback period
spy_rsi <- TTR::RSI(price=quantmod::Cl(SPY), n=3)
# Plot the closing prices of SPY
plot(quantmod::Cl(SPY))
# Overlay a 200-day SMA
lines(TTR::SMA(quantmod::Cl(SPY), n = 200), col = "red")
# What kind of indicator? (DataCamp quiz answer)
"trend"
# Plot the closing price of SPY
plot(quantmod::Cl(SPY))
# Plot the RSI 2
plot(TTR::RSI(quantmod::Cl(SPY), n = 2))
# What kind of indicator? (DataCamp quiz answer)
"reversion"
# Add a 200-day SMA indicator to strategy.st; x is quoted so it is evaluated
# lazily against mktdata when the strategy is applied
add.indicator(strategy = strategy.st, name = "SMA",
arguments = list(x=quote(quantmod::Cl(mktdata)), n=200), label = "SMA200"
)
# Add a 50-day SMA indicator to strategy.st
# Bug fix: "qunatmod" was a typo for "quantmod"; the misspelled namespace
# would only surface as an error later, when applyIndicators() evaluates
# the quoted expression.
add.indicator(strategy = strategy.st, name = "SMA",
arguments = list(x=quote(quantmod::Cl(mktdata)), n=50), label = "SMA50"
)
# Add an RSI 3 indicator to strategy.st
add.indicator(strategy = strategy.st, name = "RSI",
arguments = list(x=quote(quantmod::Cl(mktdata)), n=3), label = "RSI_3"
)
# Write the RSI_avg function
# Custom indicator: the arithmetic mean of two RSI series computed with
# lookback windows n1 and n2 on the same price series.
RSI_avg <- function(price, n1, n2) {
# Compute each RSI on its own window
rsi_short <- RSI(price = price, n = n1)
rsi_long <- RSI(price = price, n = n2)
averaged <- (rsi_short + rsi_long) / 2
# quantstrat matches indicator columns by name, so label the output column
colnames(averaged) <- "RSI_avg"
averaged
}
# Add this function as RSI_3_4 to your strategy with n1 = 3 and n2 = 4
# (note: Cl is used unqualified here, unlike the quantmod::Cl calls above;
# both work because quantmod is attached)
add.indicator(strategy.st, name = "RSI_avg",
arguments = list(price = quote(Cl(mktdata)), n1 = 3, n2 = 4), label = "RSI_3_4"
)
# While the RSI is decent, it is somewhat outdated as far as indicators go
# In this exercise, you will code another indicator from scratch
# The indicator is called the David Varadi Oscillator (DVO), originated by David Varadi, a quantitative research director
# The version you will implement is a simplified version.
# The purpose of this oscillator is similar to something like the RSI in that it attempts to find opportunities to buy a temporary dip and sell in a temporary uptrend
# In addition to obligatory market data, an oscillator function takes in two lookback periods
# First, the function computes a ratio between the closing price and average of high and low prices
# Next, it applies an SMA to that quantity to smooth out noise, usually on a very small time frame, such as two days
# Finally, it uses the runPercentRank() function to take a running percentage rank of this average ratio, and multiplies it by 100 to convert it to a 0-100 quantity
# Declare the DVO function
# Simplified David Varadi Oscillator: the running percent rank (scaled to
# 0-100) of a short-SMA-smoothed ratio of the close to the high/low midpoint.
DVO <- function(HLC, navg = 2, percentlookback = 126) {
# Close relative to the midpoint of the bar's high-low range
midpoint <- (Hi(HLC) + Lo(HLC)) / 2
ratio <- Cl(HLC) / midpoint
# Short moving average damps single-bar noise in the ratio
smoothed <- SMA(ratio, n = navg)
# Running percent rank over the lookback window, rescaled from [0,1] to [0,100]
out <- 100 * runPercentRank(smoothed, n = percentlookback, exact.multiplier = 1)
colnames(out) <- "DVO"
out
}
# Add the DVO indicator to your strategy (HLC() passes high/low/close columns)
add.indicator(strategy = strategy.st, name = "DVO",
arguments = list(HLC = quote(HLC(mktdata)), navg = 2, percentlookback = 126),
label = "DVO_2_126"
)
# Use applyIndicators to test out your indicators without running the full strategy
test <- applyIndicators(strategy = strategy.st, mktdata = OHLC(SPY))
# Subset your data between Sep. 1 and Sep. 5 of 2013 (xts date-range subsetting)
test_subset <- test["2013-09-01/2013-09-05"]
Chapter 4 - Signals
Introduction to signals:
sigComparison and sigCrossover:
sigThreshold:
sigFormula:
Example code includes:
# DO NOT RUN - the 'quantstrat' package is not installed in this environment
# Add a sigComparison which specifies that SMA50 must be greater than SMA200, call it longfilter
# (sigComparison is TRUE on every bar the relation holds)
add.signal(strategy.st, name = "sigComparison",
arguments = list(columns = c("SMA50", "SMA200"), relationship = "gt"), label = "longfilter"
)
# Add a sigCrossover which specifies that the SMA50 is less than the SMA200 and label it filterexit
# (sigCrossover fires only on the bar where the relation first becomes TRUE)
add.signal(strategy.st, name = "sigCrossover",
arguments = list(columns = c("SMA50", "SMA200"), relationship = "lt"), label = "filterexit"
)
# Implement a sigThreshold which specifies that DVO_2_126 must be less than 20, label it longthreshold
add.signal(strategy.st, name = "sigThreshold",
arguments = list(column = "DVO_2_126", threshold = 20, relationship = "lt", cross = FALSE),
label = "longthreshold"
)
# Add a sigThreshold signal to your strategy that specifies that DVO_2_126 must cross above 80 and label it thresholdexit
add.signal(strategy.st, name = "sigThreshold",
arguments = list(column = "DVO_2_126", threshold = 80, relationship = "gt", cross = TRUE),
label = "thresholdexit"
)
# Create your dataset: test (indicators first, then signals layered on top)
test_init <- applyIndicators(strategy.st, mktdata = OHLC(SPY))
test <- applySignals(strategy = strategy.st, mktdata = test_init)
# Add a sigFormula signal to your code specifying that both longfilter and longthreshold must be TRUE, label it longentry
add.signal(strategy.st, name = "sigFormula",
arguments = list(formula = "longfilter & longthreshold", cross = TRUE), label = "longentry"
)
Chapter 5 - Rules
Introduction to rules:
More rule mechanics:
More rule mechanics - II:
Order sizing functions:
Example code includes:
# DO NOT RUN - the 'quantstrat' package is not installed in this environment
# NOTE: the next several add.rule() calls are byte-identical; they come from
# successive DataCamp fill-in-the-blank exercises, each highlighting one
# argument of the same exit rule. Running them all registers duplicate rules.
# Fill in the rule's type as exit
add.rule(strategy.st, name = "ruleSignal",
arguments = list(sigcol = "filterexit", sigval = TRUE, orderqty = "all",
ordertype = "market", orderside = "long", replace = FALSE, prefer = "Open"
),
type = "exit"
)
# Fill in the sigcol argument in add.rule()
add.rule(strategy.st, name = "ruleSignal",
arguments = list(sigcol = "filterexit", sigval = TRUE, orderqty = "all",
ordertype = "market", orderside = "long", replace = FALSE, prefer = "Open"
),
type = "exit"
)
# Fill in the sigval argument in add.rule()
add.rule(strategy.st, name = "ruleSignal",
arguments = list(sigcol = "filterexit", sigval = TRUE, orderqty = "all",
ordertype = "market", orderside = "long", replace = FALSE, prefer = "Open"
),
type = "exit"
)
# Fill in the orderqty argument in add.rule()
add.rule(strategy.st, name = "ruleSignal",
arguments = list(sigcol = "filterexit", sigval = TRUE, orderqty = "all",
ordertype = "market", orderside = "long", replace = FALSE, prefer = "Open"
),
type = "exit"
)
# Fill in the ordertype argument in add.rule()
add.rule(strategy.st, name = "ruleSignal",
arguments = list(sigcol = "filterexit", sigval = TRUE, orderqty = "all",
ordertype = "market", orderside = "long", replace = FALSE, prefer = "Open"
),
type = "exit"
)
# Fill in the orderside argument in add.rule()
add.rule(strategy.st, name = "ruleSignal",
arguments = list(sigcol = "filterexit", sigval = TRUE, orderqty = "all",
ordertype = "market", orderside = "long", replace = FALSE, prefer = "Open"
),
type = "exit"
)
# Fill in the replace argument in add.rule() (same exit rule, but triggered by
# the DVO crossing above 80 rather than the SMA crossover)
add.rule(strategy.st, name = "ruleSignal",
arguments = list(sigcol = "thresholdexit", sigval = TRUE, orderqty = "all",
ordertype = "market", orderside = "long", replace = FALSE, prefer = "Open"
),
type = "exit"
)
# Fill in the prefer argument in add.rule()
add.rule(strategy.st, name = "ruleSignal",
arguments = list(sigcol = "thresholdexit", sigval = TRUE, orderqty = "all",
ordertype = "market", orderside = "long", replace = FALSE, prefer = "Open"
),
type = "exit"
)
# Create an entry rule of 1 share when all conditions line up to enter into a position
add.rule(strategy.st, name = "ruleSignal",
arguments=list(sigcol = "longentry", sigval = TRUE, orderqty = 1, ordertype = "market",
orderside = "long", replace = FALSE, prefer = "Open"
),
type = "enter"
)
# Add a rule that uses an osFUN to size an entry position (osMaxDollar caps the
# position at a fixed dollar amount instead of a fixed share count)
add.rule(strategy = strategy.st, name = "ruleSignal",
arguments = list(sigcol = "longentry", sigval = TRUE, ordertype = "market",
orderside = "long", replace = FALSE, prefer = "Open",
osFUN = osMaxDollar, tradeSize = tradesize, maxSize = tradesize
),
type = "enter"
)
Chapter 6 - Analyzing Results
Analyzing strategy:
Visualizing strategy:
Additional analytics:
Example code includes:
# DO NOT RUN - the 'quantstrat' package is not installed in this environment
# Use applyStrategy() to apply your strategy. Save this to out
out <- applyStrategy(strategy = strategy.st, portfolios = portfolio.st)
# Update your portfolio (portfolio.st)
updatePortf(portfolio.st)
# Drop the first timestamp (the initDate placeholder row)
daterange <- time(getPortfolio(portfolio.st)$summary)[-1]
# Update your account (account.st)
updateAcct(account.st, daterange)
updateEndEq(account.st)
# What is the date of the last trade? (DataCamp quiz answer)
"2013-12-23"
# Get the tradeStats for your portfolio
tstats <- tradeStats(Portfolios = portfolio.st)
# Print the profit factor (gross profits / gross losses; > 1 is profitable)
tstats$Profit.Factor
# Use chart.Posn to view your system's performance on SPY
chart.Posn(Portfolio = portfolio.st, Symbol = "SPY")
# Compute the SMA50
sma50 <- SMA(x = Cl(SPY), n = 50)
# Compute the SMA200
sma200 <- SMA(x = Cl(SPY), n = 200)
# Compute the DVO_2_126 with an navg of 2 and a percentlookback of 126
dvo <- DVO(HLC = HLC(SPY), navg = 2, percentlookback = 126)
# Recreate the chart.Posn of the strategy from the previous exercise
chart.Posn(Portfolio = portfolio.st, Symbol = "SPY")
# Overlay the SMA50 on your plot as a blue line (on = 1 draws on the price panel)
add_TA(sma50, on = 1, col = "blue")
# Overlay the SMA200 on your plot as a red line
add_TA(sma200, on = 1, col = "red")
# Add the DVO_2_126 to the plot in a new window
add_TA(dvo)
# Pull the portfolio P&L series directly from blotter's internal environment
portpl <- .blotter$portfolio.firststrat$summary$Net.Trading.PL
SharpeRatio.annualized(portpl, geometric=FALSE)
# Get instrument returns
instrets <- PortfReturns(portfolio.st)
# Compute Sharpe ratio from returns
SharpeRatio.annualized(instrets, geometric = FALSE)
Chapter 1 - Valuation of Cash Flows
Cash flows and discounting - suppose you have a unit of capital and a unit of time (t=0 is now and t=K is time in the future and Ck is cash paid at time K):
Valuation - consider v(s, t) to be the value at time s of 1 Euro paid at time t:
Actuarial equivalence:
Change of period and term structure:
Example code includes:
# Present value of a six-payment cash-flow stream at a flat 2% annual rate.
cash_flows <- c(30, 30, 30, 30, 130, 230)
# Annual effective rate i and the one-period discount factor v it implies
i <- 0.02
v <- (1 + i) ^ -1
# One discount factor per payment time t = 0, ..., 5
discount_factors <- v ^ seq(0, 5)
# Discount every payment back to time 0 and sum
present_value <- sum(cash_flows * discount_factors)
present_value
## [1] 445
# Net present value of a 10,000 outlay returning 3,000 in years 1-3 and
# 1,000 in years 4-5, discounted at a flat 5% annual rate.
cash_flows <- c(0, 3000, 3000, 3000, 1000, 1000)
# Discount factor for each time t = 0, ..., 5
discount_factors <- (1.05) ^ -(0:5)
# NPV = discounted inflows minus the initial investment
net_present_value <- sum(cash_flows * discount_factors) - 10000
net_present_value
## [1] -224
# Define the discount function v
# Value at time s of one unit payable at time t, under a flat annual rate i.
# For s < t this discounts; for s > t it accumulates.
discount <- function(s, t, i = 0.02) {
(1 + i) ^ (s - t)
}
# Calculate the present value (valuation at time s = 0 of payments at t = 0..5)
present_value <- sum(cash_flows * discount(0, 0:5))
present_value
## [1] 10481
# Calculate the value at time 6 (s = 6 > t accumulates rather than discounts)
sum(cash_flows * discount(6, 0:5))
## [1] 11804
# Calculate the value at time 6, starting from present_value
# (same answer: accumulating the time-0 value forward 6 years)
present_value * discount(6, 0)
## [1] 11804
# The block below is kept commented out: deposits/payments are not defined here
# Calculate the present value of the deposits
# PV_deposit <- sum(deposits * discount_factors)
# Calculate the present value of the payments
# PV_payment <- sum(payments * discount_factors)
# Calculate the yearly deposit K in the first 4 years
# K <- PV_payment / PV_deposit
# K
# Term structure of interest rates: 5% for years 1-3, 6% for years 4-6,
# and 7% for years 7-11.
interest <- c(0.05, 0.05, 0.05, rep(0.06, 3), rep(0.07, 5))
# One-year discount factor implied by each year's rate
yearly_discount_factors <- 1 / (1 + interest)
# Cumulative discount factors, starting from 1 at time 0
discount_factors <- c(1, cumprod(yearly_discount_factors))
# Borrow 1000 now and 5000 in one year; repay 816.86 per year for 10 years
cash_flow <- c(1000, 5000, rep(-816.86, 10))
# The PV is essentially zero: the repayments amortise the loans exactly
PV <- sum(cash_flow * discount_factors)
PV
## [1] -0.0352
# Monthly repayment on a 125,000 loan over 20 years at a 3.04% annual
# effective rate.
number_payments <- 12 * 20
# Define the yearly interest rate
i <- 0.0304
# Equivalent monthly effective rate: compounding 12 times must reproduce i
monthly_interest <- (1 + i) ^ (1 / 12) - 1
monthly_interest
## [1] 0.0025
# Discount factor for each of the 240 monthly payment dates
discount_factors <- (1 + monthly_interest) ^ -seq_len(number_payments)
# Level payment pattern: one unit at the end of every month
payments <- rep(1, number_payments)
# Solve K * (PV of a unit annuity) = 125000 for the monthly payment K
K <- 125000 / sum(payments * discount_factors)
K
## [1] 693
Chapter 2 - Life Tables
Random future lifetime - often used to express future life/mortality for a policyholder of age X:
Binomial experiments - q(x) and p(x):
Calculating probabilities:
Calculating life expectancies:
Dynamics - mortality has been changing over time, and tables are published regularly:
Example code includes:
# The mortality rates have been obtained from the Human Mortality Database and are stored in the data set life_table
# read_csv2() expects ';' separators with ',' as the decimal mark
life_table_1841_2015 <- read_csv2("./RInputFiles/life_table_females.csv")
## Using ',' as decimal and '.' as grouping mark. Use read_delim() for more control.
## Parsed with column specification:
## cols(
## year = col_double(),
## age = col_double(),
## qx = col_double(),
## lx = col_double(),
## dx = col_double(),
## ex = col_double()
## )
life_table_1999 <- read_csv2("./RInputFiles/life_table_females_1999.csv")
## Using ',' as decimal and '.' as grouping mark. Use read_delim() for more control.
## Parsed with column specification:
## cols(
## age = col_double(),
## qx = col_double(),
## lx = col_double(),
## dx = col_double(),
## ex = col_double()
## )
str(life_table_1841_2015)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 19425 obs. of 6 variables:
## $ year: num 1841 1841 1841 1841 1841 ...
## $ age : num 0 1 2 3 4 5 6 7 8 9 ...
## $ qx : num 0.138 0.0722 0.0408 0.0251 0.0183 ...
## $ lx : num 100000 86201 79976 76708 74780 ...
## $ dx : num 13799 6225 3267 1928 1370 ...
## $ ex : num 40.3 45.7 48.2 49.3 49.5 ...
## - attr(*, "spec")=
## .. cols(
## .. year = col_double(),
## .. age = col_double(),
## .. qx = col_double(),
## .. lx = col_double(),
## .. dx = col_double(),
## .. ex = col_double()
## .. )
str(life_table_1999)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 111 obs. of 5 variables:
## $ age: num 0 1 2 3 4 5 6 7 8 9 ...
## $ qx : num 0.00415 0.00039 0.00011 0.00018 0.00018 0.00012 0.00016 0.00018 0.0001 0.00008 ...
## $ lx : num 100000 99585 99546 99536 99518 ...
## $ dx : num 415 39 10 17 18 12 16 18 10 8 ...
## $ ex : num 80.9 80.2 79.2 78.3 77.3 ...
## - attr(*, "spec")=
## .. cols(
## .. age = col_double(),
## .. qx = col_double(),
## .. lx = col_double(),
## .. dx = col_double(),
## .. ex = col_double()
## .. )
# The rest of the chapter works with the single-year 1999 table
life_table <- life_table_1999
# Inspect life_table using str(), head() and tail()
str(life_table)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 111 obs. of 5 variables:
## $ age: num 0 1 2 3 4 5 6 7 8 9 ...
## $ qx : num 0.00415 0.00039 0.00011 0.00018 0.00018 0.00012 0.00016 0.00018 0.0001 0.00008 ...
## $ lx : num 100000 99585 99546 99536 99518 ...
## $ dx : num 415 39 10 17 18 12 16 18 10 8 ...
## $ ex : num 80.9 80.2 79.2 78.3 77.3 ...
## - attr(*, "spec")=
## .. cols(
## .. age = col_double(),
## .. qx = col_double(),
## .. lx = col_double(),
## .. dx = col_double(),
## .. ex = col_double()
## .. )
head(life_table)
## # A tibble: 6 x 5
## age qx lx dx ex
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 0.00415 100000 415 80.9
## 2 1 0.00039 99585 39 80.2
## 3 2 0.00011 99546 10 79.2
## 4 3 0.00018 99536 17 78.3
## 5 4 0.00018 99518 18 77.3
## 6 5 0.00012 99501 12 76.3
tail(life_table)
## # A tibble: 6 x 5
## age qx lx dx ex
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 105 0.486 73 35 1.49
## 2 106 0.504 37 19 1.43
## 3 107 0.520 19 10 1.38
## 4 108 0.536 9 5 1.34
## 5 109 0.550 4 2 1.3
## 6 110 1 2 2 1.28
# Define age, qx and ex
age <- life_table$age
qx <- life_table$qx
ex <- life_table$ex
lx <- life_table$lx
dx <- life_table$dx
px <- 1 - qx
# The probability that (18) dies before turning 19
qx[age==18]
## [1] 0.00031
# The expected future lifetime of (18)
ex[age==18]
## [1] 63.4
par(mfrow=c(1, 1))
par(mfcol=c(1, 1))
# Plot the female mortality rates in the year 1999
plot(age, qx, main = "Mortality rates (Belgium, females, 1999)",
xlab = "Age x", ylab = expression(paste("Mortality rate ", q[x])), type = "l"
)
# Plot the logarithm of the female mortality rates in the year 1999
plot(age, log(qx), main = "Log mortality rates (Belgium, females, 1999)",
xlab = "Age x", ylab = expression(paste("Log mortality rate ", log(q[x]))), type = "l"
)
# Compute the probability for (0) to reach the age 100
lx[101] / lx[1]
## [1] 0.0117
# Compute the probability for (18) to reach the age 100
lx[101] / lx[19]
## [1] 0.0118
# Plot the survival probabilities for (18) up to age 100
k <- 0:82
plot(k, lx[19+k]/lx[19], pch = 20, xlab = "k",
ylab = expression(paste(""[k], "p"[18])), main = "Survival probabilities for (18)"
)
# Plot the number of deaths dx by age
plot(age, dx, type = "h", pch = 20, xlab = "Age x",
ylab = expression("d"[x]), main = "Number of deaths (Belgium, females, 1999)"
)
# Simulate the number of deaths using a binomial distribution
sims <- rbinom(n = length(lx), size = lx, prob = qx)
# Plot the simulated number of deaths on top of the previous graph
points(age, sims, pch = 4, col = "red")
# Calculate the probability that (18) survives 5 more years
prod(px[(18+1):(18+5)])
## [1] 0.998
# Compute the survival probabilities of (18) until the age of 100
kpx <- cumprod(px[(18+1):(100)])
# Extract the probability that (18) survives until the age of 100
kpx[length(kpx)]
## [1] 0.0117
# Plot the probabilities for (18) to reach the age of 19, 20, ..., 100
plot(1:length(kpx), kpx, pch = 20, xlab = "k",
ylab = expression(paste(""[k], "p"[18])), main = "Survival probabilities for (18)"
)
# Compute the survival probabilities of (18)
kpx <- c(1, cumprod(px[(18+1):(length(px) - 1)]))
# Compute the deferred mortality probabilities of (18)
kqx <- kpx * qx[(18+1):length(px)]
# Print the sum of kqx
sum(kqx)
## [1] 1
# Plot the deferred mortality probabilities of (18)
plot(0:(length(kqx)-1), kqx, pch = 20, xlab = "k",
ylab = expression(paste(""['k|'], "q"[18])), main = "Deferred mortality probabilities of (18)"
)
# Survival probabilities and curtate expected future lifetime of (0)
kp0 <- cumprod(px)
sum(kp0)
## [1] 80.4
# Survival probabilities and curtate expected future lifetime of (18)
kp18 <- cumprod(px[(18+1):length(px)])
sum(kp18)
## [1] 62.9
# Complete expected future lifetime of (0) and (18)
ex[c(1, 19)]
## [1] 80.9 63.4
# Function to compute the curtate expected future lifetime for a given age and life table
# Curtate expected future lifetime e_x for a given age and life table.
#
# Uses the standard identity e_x = sum over k >= 1 of kpx, where kpx is
# the probability that a life aged `age` survives k more whole years,
# built from the one-year survival probabilities px = 1 - qx.
#
# age        -- integer age x (0-based, matching the table's first row)
# life_table -- data frame with (at least) a numeric `qx` column
# Returns the curtate expectation of future lifetime (a single number).
curtate_future_lifetime <- function(age, life_table) {
  survival_probs <- 1 - life_table$qx
  remaining <- survival_probs[(age + 1):length(survival_probs)]
  sum(cumprod(remaining))
}
# Vector of ages
ages <- life_table$age
# Curtate future lifetimes for all ages
future_lifetimes <- sapply(ages, FUN=curtate_future_lifetime, life_table)
# Future lifetime by age
plot(ages, future_lifetimes, type = 'l', lwd = 2, col = "green",
xlab = "Age x", ylab = "Future lifetime", main = "Future lifetime by age"
)
# Explore life_table
life_table <- life_table_1841_2015
str(life_table)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 19425 obs. of 6 variables:
## $ year: num 1841 1841 1841 1841 1841 ...
## $ age : num 0 1 2 3 4 5 6 7 8 9 ...
## $ qx : num 0.138 0.0722 0.0408 0.0251 0.0183 ...
## $ lx : num 100000 86201 79976 76708 74780 ...
## $ dx : num 13799 6225 3267 1928 1370 ...
## $ ex : num 40.3 45.7 48.2 49.3 49.5 ...
## - attr(*, "spec")=
## .. cols(
## .. year = col_double(),
## .. age = col_double(),
## .. qx = col_double(),
## .. lx = col_double(),
## .. dx = col_double(),
## .. ex = col_double()
## .. )
head(life_table)
## # A tibble: 6 x 6
## year age qx lx dx ex
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1841 0 0.138 100000 13799 40.3
## 2 1841 1 0.0722 86201 6225 45.7
## 3 1841 2 0.0408 79976 3267 48.2
## 4 1841 3 0.0251 76708 1928 49.3
## 5 1841 4 0.0183 74780 1370 49.5
## 6 1841 5 0.0138 73411 1015 49.5
range(life_table$year)
## [1] 1841 2015
# Plot the logarithm of the female mortality rates for (18) by year
with(subset(life_table, age == 18),
plot(year, log(qx), type = "l", main = "Log mortality rates (Belgium, females, 18-year-old)",
xlab = "Year t", ylab = expression(paste("Log mortality rate ", log(q[18])))
)
)
# Plot the logarithm of the female mortality rates in the year 1950 by age
with(subset(life_table, year==1950),
plot(age, log(qx), type = "l", main = "Log mortality rates (Belgium, females, 1950)",
xlab = "Age x", ylab = expression(paste("Log mortality rate ", log(q[x])))
)
)
# Construct and print the cohort life table for birth year 1981
life_table_1981 <- subset(life_table, year-age == 1981)
life_table_1981
## # A tibble: 35 x 6
## year age qx lx dx ex
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1981 0 0.00931 100000 931 77.0
## 2 1982 1 0.00067 99024 66 76.9
## 3 1983 2 0.00047 99059 46 75.9
## 4 1984 3 0.00023 99062 23 75.5
## 5 1985 4 0.00022 99024 21 74.6
## 6 1986 5 0.0002 99094 20 73.7
## 7 1987 6 0.00018 99013 18 73.4
## 8 1988 7 0.000150 99027 15 72.7
## 9 1989 8 0.00012 99119 11 71.6
## 10 1990 9 0.0002 99087 20 71.1
## # ... with 25 more rows
# 1981 cohort one-year survival probabilities
px <- 1 - life_table_1981$qx
# 1981 cohort survival probability that (18) survives 5 more years
prod(px[(18+1):(18+5)])
## [1] 0.998
# 1881 cohort survival probability that (18) survives 5 more years
with(subset(life_table, year - age == 1881), prod(1 - qx[(18 + 1):(22 + 1)]))
## [1] 0.975
Chapter 3 - Life Annuities
The basics - simple life annuities:
Whole, temporary, and deferred life annuities:
Guaranteed payments:
On premium payments and retirement plans:
Example code includes:
life_table <- life_table_1999
str(life_table)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 111 obs. of 5 variables:
## $ age: num 0 1 2 3 4 5 6 7 8 9 ...
## $ qx : num 0.00415 0.00039 0.00011 0.00018 0.00018 0.00012 0.00016 0.00018 0.0001 0.00008 ...
## $ lx : num 100000 99585 99546 99536 99518 ...
## $ dx : num 415 39 10 17 18 12 16 18 10 8 ...
## $ ex : num 80.9 80.2 79.2 78.3 77.3 ...
## - attr(*, "spec")=
## .. cols(
## .. age = col_double(),
## .. qx = col_double(),
## .. lx = col_double(),
## .. dx = col_double(),
## .. ex = col_double()
## .. )
# Define age, qx and ex
age <- life_table$age
qx <- life_table$qx
ex <- life_table$ex
lx <- life_table$lx
dx <- life_table$dx
px <- 1 - qx
# PV of guaranteed payment of 10,000 in 5 years
PV <- 10000 * (1+0.02) ^ -5
PV
## [1] 9057
# 5 year survival probabilities of (20)
kpx <- prod(px[(20+1):(20+5)])
# EPV of pure endowment of 10,000 in 5 years for (20)
PV * kpx
## [1] 9039
# PV of guaranteed payments of 10,000 in 5, 10 and 30 years
PV <- 10000 * (1+0.02) ^ -c(5, 10, 30)
PV
## [1] 9057 8203 5521
# Survival probabilities of (20)
kpx <- cumprod(px[(20+1):length(px)])
# EPV of pure endowments of 10,000 in 5, 10 and 30 years for (20)
PV * kpx[c(5, 10, 30)]
## [1] 9039 8172 5363
# Function to compute the EPV of a whole life annuity due for a given age, interest rate i and life table
# EPV of a whole life annuity-due of 1 per year for a given age,
# interest rate and life table.
#
# Payments occur at times k = 0, 1, 2, ... while the life survives, so
# the EPV is sum over k of v^k * kpx with v = 1/(1+i) and kp x the
# k-year survival probability (k = 0 term is the certain payment of 1).
#
# age        -- integer age x of the annuitant
# i          -- annual effective interest rate
# life_table -- data frame with a numeric `qx` column
# Returns the expected present value (a single number).
life_annuity_due <- function(age, i, life_table) {
  surv <- 1 - life_table$qx
  kpx <- c(1, cumprod(surv[(age + 1):length(surv)]))
  v <- 1 / (1 + i)
  sum(v ^ (seq_along(kpx) - 1) * kpx)
}
# EPV of a whole life annuity due for (20) at interest rate 2% using life_table
life_annuity_due(age=20, i=0.02, life_table=life_table)
## [1] 35.5
# EPV of a whole life annuity due for (20) at interest rate 5% and for (65) at interest rate 2% using life_table
life_annuity_due(age=20, i=0.05, life_table=life_table)
## [1] 19.7
life_annuity_due(age=65, i=0.02, life_table=life_table)
## [1] 16.2
# EPV of a whole life annuity due for (20) at interest rate 2% using life_table
life_annuity_due(20, 0.02, life_table)
## [1] 35.5
# Function to compute the EPV of a whole life immediate annuity for a given age, interest rate i and life table
# EPV of a whole life immediate annuity of 1 per year for a given age,
# interest rate and life table.
#
# Unlike the annuity-due, payments occur at times k = 1, 2, ... (end of
# each survived year), so the certain time-0 payment is omitted:
# EPV = sum over k >= 1 of v^k * kpx.
#
# age        -- integer age x of the annuitant
# i          -- annual effective interest rate
# life_table -- data frame with a numeric `qx` column
# Returns the expected present value (a single number).
life_immediate_annuity <- function(age, i, life_table) {
  surv <- 1 - life_table$qx
  kpx <- cumprod(surv[(age + 1):length(surv)])
  v <- 1 / (1 + i)
  sum(v ^ seq_along(kpx) * kpx)
}
# EPV of a whole life immediate annuity for (20) at interest rate 2% using life_table
life_immediate_annuity(20, 0.02, life_table)
## [1] 34.5
# EPV of a whole life annuity due for (20) at interest rate 2% using life_table
life_annuity_due(20, 0.02, life_table)
## [1] 35.5
# Function to compute the EPV of a temporary life annuity due for a given age, period of n years, interest rate i and life table
# EPV of an n-year temporary life annuity-due of 1 per year for a given
# age, interest rate and life table.
#
# Payments occur at times k = 0, ..., n-1 while the life survives:
# EPV = sum_{k=0}^{n-1} v^k * kpx.
#
# age        -- integer age x of the annuitant
# n          -- number of annual payments (n >= 1)
# i          -- annual effective interest rate
# life_table -- data frame with a numeric `qx` column
# Returns the expected present value (a single number).
temporary_life_annuity_due <- function(age, n, i, life_table) {
  px <- 1 - life_table$qx
  # Survival probabilities kpx for k = 0, ..., n-1. Guard n == 1: the
  # index (age+1):(age+n-1) would count backwards in R and silently
  # produce a wrong result; with a single payment only the certain
  # k = 0 term (kpx = 1) remains.
  kpx <- if (n > 1) c(1, cumprod(px[(age + 1):(age + n - 1)])) else 1
  discount_factors <- (1 + i) ^ -(0:(n - 1))
  sum(discount_factors * kpx)
}
# EPV of a temporary life annuity due for (20) over 10 years at interest rate 2% using life_table
temporary_life_annuity_due(age=20, n=10, i=0.02, life_table)
## [1] 9.15
# Pension benefits
benefits <- 20000 * (1+0.02) ^ (0:35)
# Discount factors (to age 65)
discount_factors <- (1+0.04) ^ -(0:35)
# PV of pension at age 65
PV_65 <- sum(discount_factors * benefits)
PV_65
## [1] 523061
# PV of pension at age 20
PV_20 <- PV_65 * (1+0.03) ^ -45
PV_20
## [1] 138318
# Survival probabilities of (65) up to age 100
kpx <- c(1, cumprod(px[(65+1):(100)]))
# EPV of pension at age 65
EPV_65 <- sum(discount_factors * kpx * benefits)
cbind(PV_65, EPV_65)
## PV_65 EPV_65
## [1,] 523061 325981
# EPV of pension at age 20
EPV_20 <- EPV_65 * (1.03 ^ -45 * prod(px[(20+1):(65)]))
cbind(PV_20, EPV_20)
## PV_20 EPV_20
## [1,] 138318 77774
# Survival probabilities of (40)
kpx <- c(1, cumprod(px[(40+1):length(px)]))
# Discount factors (to age 40)
discount_factors <- (1 + 0.03) ^ -(0:(length(kpx)-1))
# Pension benefits
benefits <- c(rep(0, 25), rep(18000, length(kpx) - 25))
# The single premium
single_premium <- sum(benefits * discount_factors * kpx)
single_premium
## [1] 115843
# Premium pattern rho
rho <- c(rep(1, 15), rep(0.5, 10), rep(0, length(kpx)-25))
# The initial premium
initial_premium <- single_premium / sum(rho * discount_factors * kpx)
initial_premium
## [1] 7817
# The annual premiums
initial_premium * rho
## [1] 7817 7817 7817 7817 7817 7817 7817 7817 7817 7817 7817 7817 7817 7817 7817
## [16] 3908 3908 3908 3908 3908 3908 3908 3908 3908 3908 0 0 0 0 0
## [31] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [46] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [61] 0 0 0 0 0 0 0 0 0 0 0 0
# Sum of the annual premiums (no actuarial discounting)
sum(initial_premium * rho)
## [1] 156331
# Curtate life expectancy of (40)
sum(kpx[-1])
## [1] 41.5
# Present value of annuity benefits when (40) lives until age 75
subset1 <- 1:36
sum(benefits[subset1] * discount_factors[subset1])
## [1] 81930
# Present value of annuity benefits when (40) lives until age 95
subset2 <- 1:56
sum(benefits[subset2] * discount_factors[subset2])
## [1] 177100
Chapter 4 - Life Insurances
Basics - life insurance is somewhat the inverse of the life annuity (pays benefits upon death):
Whole, temporary, and deferred:
Combined benefits:
Wrap up:
Example code includes:
# 10-year survival probability of (20)
kpx <- prod(px[(20+1):(30)])
kpx
## [1] 0.996
# 10-year deferred mortality probability of (20)
kqx <- kpx * qx[30+1]
kqx
## [1] 0.000438
# Discount factor
discount_factor <- (1 + 0.01) ^ -11
discount_factor
## [1] 0.896
# EPV of the simple life insurance
10000 * discount_factor * kqx
## [1] 3.93
# Plot the EPV of a whole life insurance for ages 0-100 at a fixed 3%
# interest rate, as a red line with the EPV axis clamped to [0, 1].
# NOTE(review): relies on `whole_life_insurance()` and `life_table`
# being defined in the calling environment.
plot_by_age <- function() {
  xs <- 0:100
  epv_by_age <- sapply(xs, whole_life_insurance, i = 0.03, life_table = life_table)
  plot(xs, epv_by_age,
    type = 'l', col = "red", ylim = c(0, 1),
    main = "Whole life insurance (interest rate i = 3%)", xlab = "Age x", ylab = "EPV"
  )
}
# Plot the EPV of a whole life insurance for a 20-year-old across
# interest rates 0.1% to 10%, as a red line with the EPV axis clamped
# to [0, 1]. NOTE(review): relies on `whole_life_insurance()` and
# `life_table` being defined in the calling environment.
plot_by_interest_rate <- function() {
  rates <- seq(0.001, 0.10, by = 0.001)
  epv_by_rate <- sapply(rates, whole_life_insurance, age = 20, life_table = life_table)
  plot(rates, epv_by_rate,
    type = 'l', col = "red", ylim = c(0, 1),
    main = "Whole life insurance (age 20)", xlab = "Interest rate i", ylab = "EPV"
  )
}
# Function to compute the EPV of a whole life insurance
# EPV of a whole life insurance of 1 payable at the end of the year of
# death, for a given age, interest rate and life table.
#
# EPV = sum over k >= 0 of v^(k+1) * k|qx, where k|qx = kpx * q_{x+k}
# is the deferred mortality probability (die in year k+1).
#
# age        -- integer age x of the insured
# i          -- annual effective interest rate
# life_table -- data frame with a numeric `qx` column
# Returns the expected present value (a single number).
whole_life_insurance <- function(age, i, life_table) {
  qx <- life_table$qx
  px <- 1 - qx
  last <- length(px)
  # kpx for k = 0, ..., last-age-1. Guard the terminal age: for
  # age == last-1 the original index (age+1):(last-1) counts backwards
  # in R; that is only accidentally harmless for tables ending in
  # qx = 1 and gives wrong EPVs otherwise.
  kpx <- if (age < last - 1) c(1, cumprod(px[(age + 1):(last - 1)])) else 1
  kqx <- kpx * qx[(age + 1):last]
  discount_factors <- (1 + i) ^ -(1:length(kqx))
  sum(discount_factors * kqx)
}
# Plot the EPV of a whole life insurance for a range of ages at interest rate 3% using life_table
plot_by_age()
# Plot the EPV of a whole life insurance for (20) for a range of interest rates using life_table
plot_by_interest_rate()
# EPV of a whole life insurance for (20) at interest rate 2% using life_table
whole_life_insurance(20, 0.02, life_table)
## [1] 0.303
# Function to compute the EPV of a temporary life insurance
# EPV of an n-year temporary (term) life insurance of 1 payable at the
# end of the year of death, for a given age, interest rate and table.
#
# EPV = sum_{k=0}^{n-1} v^(k+1) * k|qx, covering death within the next
# n years only.
#
# age        -- integer age x of the insured
# n          -- term of the cover in years (n >= 1)
# i          -- annual effective interest rate
# life_table -- data frame with a numeric `qx` column
# Returns the expected present value (a single number).
temporary_life_insurance <- function(age, n, i, life_table) {
  qx <- life_table$qx
  px <- 1 - qx
  # Guard n == 1: the index (age+1):(age+n-1) would count backwards in
  # R and inflate the EPV; a one-year term leaves only the k = 0 term.
  kpx <- if (n > 1) c(1, cumprod(px[(age + 1):(age + n - 1)])) else 1
  kqx <- kpx * qx[(age + 1):(age + n)]
  discount_factors <- (1 + i) ^ -(1:length(kqx))
  sum(discount_factors * kqx)
}
# EPV of a temporary life insurance for (20) over a period of 45 years at interest rate 2% using life_table
temporary_life_insurance(age=20, n=45, i=0.02, life_table)
## [1] 0.051
# EPV of a whole life insurance for (20) at interest rate 2% using life_table
whole_life_insurance(20, 0.02, life_table)
## [1] 0.303
# Function to compute the EPV of a deferred whole life insurance
# EPV of a u-year deferred whole life insurance of 1 payable at the end
# of the year of death, for a given age, interest rate and life table.
#
# Benefits are 0 for death in the first u years and 1 afterwards:
# EPV = sum_{k>=u} v^(k+1) * k|qx.
#
# age        -- integer age x of the insured
# u          -- deferral period in years (0 <= u <= remaining table length)
# i          -- annual effective interest rate
# life_table -- data frame with a numeric `qx` column
# Returns the expected present value (a single number).
deferred_life_insurance <- function(age, u, i, life_table) {
  qx <- life_table$qx
  px <- 1 - qx
  last <- length(px)
  # Guard the terminal age: for age == last-1 the original index
  # (age+1):(last-1) counts backwards in R (only accidentally harmless
  # when the table ends in qx = 1).
  kpx <- if (age < last - 1) c(1, cumprod(px[(age + 1):(last - 1)])) else 1
  kqx <- kpx * qx[(age + 1):last]
  discount_factors <- (1 + i) ^ -(1:length(kqx))
  benefits <- c(rep(0, u), rep(1, length(kpx) - u))
  sum(benefits * discount_factors * kqx)
}
# EPV of a deferred life insurance for (20) deferred over 45 years at interest rate 2% using life_table
deferred_life_insurance(age=20, u=45, i=0.02, life_table)
## [1] 0.252
i <- 0.05
# Deferred mortality probabilities of (48)
kqx <- c(1, cumprod(px[(48+1):(48+27-1)])) * qx[(48+1):(48+27)]
# Discount factors
discount_factors <- (1 + i) ^ -(1:length(kqx))
# Death benefits
benefits <- c(rep(0, 7), rep(40000, length(kqx) - 7))
# EPV of the death benefits
EPV_death_benefits <- sum(benefits * discount_factors * kqx)
EPV_death_benefits
## [1] 2835
# Pure endowment
EPV_pure_endowment <- 80000 * (1 + i) ^ -27 * prod(px[(48+1):(48+27)])
EPV_pure_endowment
## [1] 17050
# Premium pattern
kpx <- c(1, cumprod(px[(48+1):(48+27-1)]))
discount_factors <- (1+i) ^ - (0:(length(kpx) - 1))
rho <- rep(1, length(kpx))
EPV_rho <- sum(kpx * discount_factors * rho)
EPV_rho
## [1] 14.7
# Premium level
(EPV_pure_endowment + EPV_death_benefits) / EPV_rho
## [1] 1355
Chapter 1 - Transforming Data with dplyr
Counties Dataset:
Filter and Arrange Verbs:
Mutate:
Example code includes:
counties <- readRDS("./RInputFiles/counties.rds")
babynames <- readRDS("./RInputFiles/babynames.rds")
str(counties)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 3138 obs. of 40 variables:
## $ census_id : chr "1001" "1003" "1005" "1007" ...
## $ state : chr "Alabama" "Alabama" "Alabama" "Alabama" ...
## $ county : chr "Autauga" "Baldwin" "Barbour" "Bibb" ...
## $ region : chr "South" "South" "South" "South" ...
## $ metro : chr "Metro" "Metro" "Nonmetro" "Metro" ...
## $ population : num 55221 195121 26932 22604 57710 ...
## $ men : num 26745 95314 14497 12073 28512 ...
## $ women : num 28476 99807 12435 10531 29198 ...
## $ hispanic : num 2.6 4.5 4.6 2.2 8.6 4.4 1.2 3.5 0.4 1.5 ...
## $ white : num 75.8 83.1 46.2 74.5 87.9 22.2 53.3 73 57.3 91.7 ...
## $ black : num 18.5 9.5 46.7 21.4 1.5 70.7 43.8 20.3 40.3 4.8 ...
## $ native : num 0.4 0.6 0.2 0.4 0.3 1.2 0.1 0.2 0.2 0.6 ...
## $ asian : num 1 0.7 0.4 0.1 0.1 0.2 0.4 0.9 0.8 0.3 ...
## $ pacific : num 0 0 0 0 0 0 0 0 0 0 ...
## $ citizens : num 40725 147695 20714 17495 42345 ...
## $ income : num 51281 50254 32964 38678 45813 ...
## $ income_err : num 2391 1263 2973 3995 3141 ...
## $ income_per_cap : num 24974 27317 16824 18431 20532 ...
## $ income_per_cap_err: num 1080 711 798 1618 708 ...
## $ poverty : num 12.9 13.4 26.7 16.8 16.7 24.6 25.4 20.5 21.6 19.2 ...
## $ child_poverty : num 18.6 19.2 45.3 27.9 27.2 38.4 39.2 31.6 37.2 30.1 ...
## $ professional : num 33.2 33.1 26.8 21.5 28.5 18.8 27.5 27.3 23.3 29.3 ...
## $ service : num 17 17.7 16.1 17.9 14.1 15 16.6 17.7 14.5 16 ...
## $ office : num 24.2 27.1 23.1 17.8 23.9 19.7 21.9 24.2 26.3 19.5 ...
## $ construction : num 8.6 10.8 10.8 19 13.5 20.1 10.3 10.5 11.5 13.7 ...
## $ production : num 17.1 11.2 23.1 23.7 19.9 26.4 23.7 20.4 24.4 21.5 ...
## $ drive : num 87.5 84.7 83.8 83.2 84.9 74.9 84.5 85.3 85.1 83.9 ...
## $ carpool : num 8.8 8.8 10.9 13.5 11.2 14.9 12.4 9.4 11.9 12.1 ...
## $ transit : num 0.1 0.1 0.4 0.5 0.4 0.7 0 0.2 0.2 0.2 ...
## $ walk : num 0.5 1 1.8 0.6 0.9 5 0.8 1.2 0.3 0.6 ...
## $ other_transp : num 1.3 1.4 1.5 1.5 0.4 1.7 0.6 1.2 0.4 0.7 ...
## $ work_at_home : num 1.8 3.9 1.6 0.7 2.3 2.8 1.7 2.7 2.1 2.5 ...
## $ mean_commute : num 26.5 26.4 24.1 28.8 34.9 27.5 24.6 24.1 25.1 27.4 ...
## $ employed : num 23986 85953 8597 8294 22189 ...
## $ private_work : num 73.6 81.5 71.8 76.8 82 79.5 77.4 74.1 85.1 73.1 ...
## $ public_work : num 20.9 12.3 20.8 16.1 13.5 15.1 16.2 20.8 12.1 18.5 ...
## $ self_employed : num 5.5 5.8 7.3 6.7 4.2 5.4 6.2 5 2.8 7.9 ...
## $ family_work : num 0 0.4 0.1 0.4 0.4 0 0.2 0.1 0 0.5 ...
## $ unemployment : num 7.6 7.5 17.6 8.3 7.7 18 10.9 12.3 8.9 7.9 ...
## $ land_area : num 594 1590 885 623 645 ...
str(babynames)
## Classes 'tbl_df', 'tbl' and 'data.frame': 332595 obs. of 3 variables:
## $ year : num 1880 1880 1880 1880 1880 1880 1880 1880 1880 1880 ...
## $ name : chr "Aaron" "Ab" "Abbie" "Abbott" ...
## $ number: int 102 5 71 5 6 50 9 12 27 81 ...
# Select the columns
counties %>%
select(state, county, population, poverty)
## # A tibble: 3,138 x 4
## state county population poverty
## <chr> <chr> <dbl> <dbl>
## 1 Alabama Autauga 55221 12.9
## 2 Alabama Baldwin 195121 13.4
## 3 Alabama Barbour 26932 26.7
## 4 Alabama Bibb 22604 16.8
## 5 Alabama Blount 57710 16.7
## 6 Alabama Bullock 10678 24.6
## 7 Alabama Butler 20354 25.4
## 8 Alabama Calhoun 116648 20.5
## 9 Alabama Chambers 34079 21.6
## 10 Alabama Cherokee 26008 19.2
## # ... with 3,128 more rows
counties_selected <- counties %>%
select(state, county, population, private_work, public_work, self_employed)
# Add a verb to sort in descending order of public_work
counties_selected %>%
arrange(desc(public_work))
## # A tibble: 3,138 x 6
## state county population private_work public_work self_employed
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Hawaii Kalawao 85 25 64.1 10.9
## 2 Alaska Yukon-Koyukuk Ce~ 5644 33.3 61.7 5.1
## 3 Wisconsin Menominee 4451 36.8 59.1 3.7
## 4 North Da~ Sioux 4380 32.9 56.8 10.2
## 5 South Da~ Todd 9942 34.4 55 9.8
## 6 Alaska Lake and Peninsu~ 1474 42.2 51.6 6.1
## 7 Californ~ Lassen 32645 42.6 50.5 6.8
## 8 South Da~ Buffalo 2038 48.4 49.5 1.8
## 9 South Da~ Dewey 5579 34.9 49.2 14.7
## 10 Texas Kenedy 565 51.9 48.1 0
## # ... with 3,128 more rows
counties_selected <- counties %>%
select(state, county, population)
# Filter for counties in the state of California that have a population above 1000000
counties_selected %>%
filter(state=="California", population > 1000000)
## # A tibble: 9 x 3
## state county population
## <chr> <chr> <dbl>
## 1 California Alameda 1584983
## 2 California Contra Costa 1096068
## 3 California Los Angeles 10038388
## 4 California Orange 3116069
## 5 California Riverside 2298032
## 6 California Sacramento 1465832
## 7 California San Bernardino 2094769
## 8 California San Diego 3223096
## 9 California Santa Clara 1868149
counties_selected <- counties %>%
select(state, county, population, private_work, public_work, self_employed)
# Filter for Texas and more than 10000 people; sort in descending order of private_work
counties_selected %>%
filter(state=="Texas", population > 10000) %>%
arrange(desc(private_work))
## # A tibble: 169 x 6
## state county population private_work public_work self_employed
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Texas Gregg 123178 84.7 9.8 5.4
## 2 Texas Collin 862215 84.1 10 5.8
## 3 Texas Dallas 2485003 83.9 9.5 6.4
## 4 Texas Harris 4356362 83.4 10.1 6.3
## 5 Texas Andrews 16775 83.1 9.6 6.8
## 6 Texas Tarrant 1914526 83.1 11.4 5.4
## 7 Texas Titus 32553 82.5 10 7.4
## 8 Texas Denton 731851 82.2 11.9 5.7
## 9 Texas Ector 149557 82 11.2 6.7
## 10 Texas Moore 22281 82 11.7 5.9
## # ... with 159 more rows
counties_selected <- counties %>%
select(state, county, population, public_work)
# Sort in descending order of the public_workers column
counties_selected %>%
mutate(public_workers = public_work * population / 100) %>%
arrange(desc(public_workers))
## # A tibble: 3,138 x 5
## state county population public_work public_workers
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 California Los Angeles 10038388 11.5 1154415.
## 2 Illinois Cook 5236393 11.5 602185.
## 3 California San Diego 3223096 14.8 477018.
## 4 Arizona Maricopa 4018143 11.7 470123.
## 5 Texas Harris 4356362 10.1 439993.
## 6 New York Kings 2595259 14.4 373717.
## 7 California San Bernardino 2094769 16.7 349826.
## 8 California Riverside 2298032 14.9 342407.
## 9 California Sacramento 1465832 21.8 319551.
## 10 California Orange 3116069 10.2 317839.
## # ... with 3,128 more rows
# Select the columns state, county, population, men, and women
counties_selected <- counties %>%
select(state, county, population, men, women)
# Calculate proportion_women as the fraction of the population made up of women
counties_selected %>%
mutate(proportion_women = women / population)
## # A tibble: 3,138 x 6
## state county population men women proportion_women
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Alabama Autauga 55221 26745 28476 0.516
## 2 Alabama Baldwin 195121 95314 99807 0.512
## 3 Alabama Barbour 26932 14497 12435 0.462
## 4 Alabama Bibb 22604 12073 10531 0.466
## 5 Alabama Blount 57710 28512 29198 0.506
## 6 Alabama Bullock 10678 5660 5018 0.470
## 7 Alabama Butler 20354 9502 10852 0.533
## 8 Alabama Calhoun 116648 56274 60374 0.518
## 9 Alabama Chambers 34079 16258 17821 0.523
## 10 Alabama Cherokee 26008 12975 13033 0.501
## # ... with 3,128 more rows
counties %>%
# Select the five columns
select(state, county, population, men, women) %>%
# Add the proportion_men variable
mutate(proportion_men = men/population) %>%
# Filter for population of at least 10,000
filter(population >= 10000) %>%
# Arrange proportion of men in descending order
arrange(desc(proportion_men))
## # A tibble: 2,437 x 6
## state county population men women proportion_men
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Virginia Sussex 11864 8130 3734 0.685
## 2 California Lassen 32645 21818 10827 0.668
## 3 Georgia Chattahoochee 11914 7940 3974 0.666
## 4 Louisiana West Feliciana 15415 10228 5187 0.664
## 5 Florida Union 15191 9830 5361 0.647
## 6 Texas Jones 19978 12652 7326 0.633
## 7 Missouri DeKalb 12782 8080 4702 0.632
## 8 Texas Madison 13838 8648 5190 0.625
## 9 Virginia Greensville 11760 7303 4457 0.621
## 10 Texas Anderson 57915 35469 22446 0.612
## # ... with 2,427 more rows
Chapter 2 - Aggregating Data
Count Verb:
Group By, Summarize, and Ungroup:
The top_n verb:
Example code includes:
# Use count to find the number of counties in each region
counties %>%
count(region, sort=TRUE)
## # A tibble: 4 x 2
## region n
## <chr> <int>
## 1 South 1420
## 2 North Central 1054
## 3 West 447
## 4 Northeast 217
# Find number of counties per state, weighted by citizens
counties %>%
count(state, wt=citizens, sort=TRUE)
## # A tibble: 50 x 2
## state n
## <chr> <dbl>
## 1 California 24280349
## 2 Texas 16864864
## 3 Florida 13933052
## 4 New York 13531404
## 5 Pennsylvania 9710416
## 6 Illinois 8979999
## 7 Ohio 8709050
## 8 Michigan 7380136
## 9 North Carolina 7107998
## 10 Georgia 6978660
## # ... with 40 more rows
counties %>%
# Add population_walk containing the total number of people who walk to work
mutate(population_walk = walk * population / 100) %>%
# Count weighted by the new column
count(state, wt=population_walk, sort=TRUE)
## # A tibble: 50 x 2
## state n
## <chr> <dbl>
## 1 New York 1237938.
## 2 California 1017964.
## 3 Pennsylvania 505397.
## 4 Texas 430783.
## 5 Illinois 400346.
## 6 Massachusetts 316765.
## 7 Florida 284723.
## 8 New Jersey 273047.
## 9 Ohio 266911.
## 10 Washington 239764.
## # ... with 40 more rows
# Summarize to find minimum population, maximum unemployment, and average income
counties %>%
summarize(min_population=min(population),
max_unemployment=max(unemployment),
average_income=mean(income)
)
## # A tibble: 1 x 3
## min_population max_unemployment average_income
## <dbl> <dbl> <dbl>
## 1 85 29.4 46832.
# Add a density column, then sort in descending order
counties %>%
group_by(state) %>%
summarize(total_area = sum(land_area), total_population = sum(population)) %>%
mutate(density = total_population / total_area) %>%
arrange(desc(density))
## # A tibble: 50 x 4
## state total_area total_population density
## <chr> <dbl> <dbl> <dbl>
## 1 New Jersey 7354. 8904413 1211.
## 2 Rhode Island 1034. 1053661 1019.
## 3 Massachusetts 7800. 6705586 860.
## 4 Connecticut 4842. 3593222 742.
## 5 Maryland 9707. 5930538 611.
## 6 Delaware 1949. 926454 475.
## 7 New York 47126. 19673174 417.
## 8 Florida 53625. 19645772 366.
## 9 Pennsylvania 44743. 12779559 286.
## 10 Ohio 40861. 11575977 283.
## # ... with 40 more rows
# Calculate the average_pop and median_pop columns
counties %>%
group_by(region, state) %>%
summarize(total_pop = sum(population)) %>%
summarize(average_pop = mean(total_pop), median_pop=median(total_pop))
## # A tibble: 4 x 3
## region average_pop median_pop
## <chr> <dbl> <dbl>
## 1 North Central 5627687. 5580644
## 2 Northeast 6221058. 3593222
## 3 South 7370486 4804098
## 4 West 5722755. 2798636
# Group by region and find the greatest number of citizens who walk to work
counties %>%
group_by(region) %>%
top_n(1, walk) %>%
select(state, county, region, metro, population, walk, citizens)
## # A tibble: 4 x 7
## # Groups: region [4]
## state county region metro population walk citizens
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 Alaska Aleutians East Boro~ West Nonmet~ 3304 71.2 1874
## 2 New York New York Northeast Metro 1629507 20.7 1156936
## 3 North Dako~ McIntosh North Cent~ Nonmet~ 2759 17.5 2239
## 4 Virginia Lexington city South Nonmet~ 7071 31.7 6261
counties %>%
group_by(region, state) %>%
# Calculate average income
summarize(average_income=mean(income)) %>%
# Find the highest income state in each region
top_n(1, average_income)
## # A tibble: 4 x 3
## # Groups: region [4]
## region state average_income
## <chr> <chr> <dbl>
## 1 North Central North Dakota 55575.
## 2 Northeast New Jersey 73014.
## 3 South Maryland 69200.
## 4 West Alaska 65125.
# Count the states with more people in Metro or Nonmetro areas
counties %>%
group_by(state, metro) %>%
summarize(total_pop = sum(population)) %>%
top_n(1, total_pop) %>%
ungroup() %>%
count(metro)
## # A tibble: 2 x 2
## metro n
## <chr> <int>
## 1 Metro 44
## 2 Nonmetro 6
Chapter 3 - Selecting and Transforming Data
Selecting:
Renaming:
Transmuting:
Example code includes:
# Glimpse the counties table
glimpse(counties)
## Observations: 3,138
## Variables: 40
## $ census_id <chr> "1001", "1003", "1005", "1007", "1009", "1011", ...
## $ state <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Ala...
## $ county <chr> "Autauga", "Baldwin", "Barbour", "Bibb", "Blount...
## $ region <chr> "South", "South", "South", "South", "South", "So...
## $ metro <chr> "Metro", "Metro", "Nonmetro", "Metro", "Metro", ...
## $ population <dbl> 55221, 195121, 26932, 22604, 57710, 10678, 20354...
## $ men <dbl> 26745, 95314, 14497, 12073, 28512, 5660, 9502, 5...
## $ women <dbl> 28476, 99807, 12435, 10531, 29198, 5018, 10852, ...
## $ hispanic <dbl> 2.6, 4.5, 4.6, 2.2, 8.6, 4.4, 1.2, 3.5, 0.4, 1.5...
## $ white <dbl> 75.8, 83.1, 46.2, 74.5, 87.9, 22.2, 53.3, 73.0, ...
## $ black <dbl> 18.5, 9.5, 46.7, 21.4, 1.5, 70.7, 43.8, 20.3, 40...
## $ native <dbl> 0.4, 0.6, 0.2, 0.4, 0.3, 1.2, 0.1, 0.2, 0.2, 0.6...
## $ asian <dbl> 1.0, 0.7, 0.4, 0.1, 0.1, 0.2, 0.4, 0.9, 0.8, 0.3...
## $ pacific <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0...
## $ citizens <dbl> 40725, 147695, 20714, 17495, 42345, 8057, 15581,...
## $ income <dbl> 51281, 50254, 32964, 38678, 45813, 31938, 32229,...
## $ income_err <dbl> 2391, 1263, 2973, 3995, 3141, 5884, 1793, 925, 2...
## $ income_per_cap <dbl> 24974, 27317, 16824, 18431, 20532, 17580, 18390,...
## $ income_per_cap_err <dbl> 1080, 711, 798, 1618, 708, 2055, 714, 489, 1366,...
## $ poverty <dbl> 12.9, 13.4, 26.7, 16.8, 16.7, 24.6, 25.4, 20.5, ...
## $ child_poverty <dbl> 18.6, 19.2, 45.3, 27.9, 27.2, 38.4, 39.2, 31.6, ...
## $ professional <dbl> 33.2, 33.1, 26.8, 21.5, 28.5, 18.8, 27.5, 27.3, ...
## $ service <dbl> 17.0, 17.7, 16.1, 17.9, 14.1, 15.0, 16.6, 17.7, ...
## $ office <dbl> 24.2, 27.1, 23.1, 17.8, 23.9, 19.7, 21.9, 24.2, ...
## $ construction <dbl> 8.6, 10.8, 10.8, 19.0, 13.5, 20.1, 10.3, 10.5, 1...
## $ production <dbl> 17.1, 11.2, 23.1, 23.7, 19.9, 26.4, 23.7, 20.4, ...
## $ drive <dbl> 87.5, 84.7, 83.8, 83.2, 84.9, 74.9, 84.5, 85.3, ...
## $ carpool <dbl> 8.8, 8.8, 10.9, 13.5, 11.2, 14.9, 12.4, 9.4, 11....
## $ transit <dbl> 0.1, 0.1, 0.4, 0.5, 0.4, 0.7, 0.0, 0.2, 0.2, 0.2...
## $ walk <dbl> 0.5, 1.0, 1.8, 0.6, 0.9, 5.0, 0.8, 1.2, 0.3, 0.6...
## $ other_transp <dbl> 1.3, 1.4, 1.5, 1.5, 0.4, 1.7, 0.6, 1.2, 0.4, 0.7...
## $ work_at_home <dbl> 1.8, 3.9, 1.6, 0.7, 2.3, 2.8, 1.7, 2.7, 2.1, 2.5...
## $ mean_commute <dbl> 26.5, 26.4, 24.1, 28.8, 34.9, 27.5, 24.6, 24.1, ...
## $ employed <dbl> 23986, 85953, 8597, 8294, 22189, 3865, 7813, 474...
## $ private_work <dbl> 73.6, 81.5, 71.8, 76.8, 82.0, 79.5, 77.4, 74.1, ...
## $ public_work <dbl> 20.9, 12.3, 20.8, 16.1, 13.5, 15.1, 16.2, 20.8, ...
## $ self_employed <dbl> 5.5, 5.8, 7.3, 6.7, 4.2, 5.4, 6.2, 5.0, 2.8, 7.9...
## $ family_work <dbl> 0.0, 0.4, 0.1, 0.4, 0.4, 0.0, 0.2, 0.1, 0.0, 0.5...
## $ unemployment <dbl> 7.6, 7.5, 17.6, 8.3, 7.7, 18.0, 10.9, 12.3, 8.9,...
## $ land_area <dbl> 594, 1590, 885, 623, 645, 623, 777, 606, 597, 55...
counties %>%
# Select state, county, population, and industry-related columns
select(state, county, population, professional:production) %>%
# Arrange service in descending order
arrange(desc(service))
## # A tibble: 3,138 x 8
## state county population professional service office construction production
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Missis~ Tunica 10477 23.9 36.6 21.5 3.5 14.5
## 2 Texas Kinney 3577 30 36.5 11.6 20.5 1.3
## 3 Texas Kenedy 565 24.9 34.1 20.5 20.5 0
## 4 New Yo~ Bronx 1428357 24.3 33.3 24.2 7.1 11
## 5 Texas Brooks 7221 19.6 32.4 25.3 11.1 11.5
## 6 Colora~ Fremo~ 46809 26.6 32.2 22.8 10.7 7.6
## 7 Texas Culbe~ 2296 20.1 32.2 24.2 15.7 7.8
## 8 Califo~ Del N~ 27788 33.9 31.5 18.8 8.9 6.8
## 9 Minnes~ Mahno~ 5496 26.8 31.5 18.7 13.1 9.9
## 10 Virgin~ Lanca~ 11129 30.3 31.2 22.8 8.1 7.6
## # ... with 3,128 more rows
counties %>%
  # Identifiers, population, and every column whose name ends in "work"
  select(state, county, population, ends_with("work")) %>%
  # Keep counties where at least half the workers are in public work
  filter(public_work >= 50)
## # A tibble: 7 x 6
## state county population private_work public_work family_work
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Alaska Lake and Peninsula~ 1474 42.2 51.6 0.2
## 2 Alaska Yukon-Koyukuk Cens~ 5644 33.3 61.7 0
## 3 California Lassen 32645 42.6 50.5 0.1
## 4 Hawaii Kalawao 85 25 64.1 0
## 5 North Dak~ Sioux 4380 32.9 56.8 0.1
## 6 South Dak~ Todd 9942 34.4 55 0.8
## 7 Wisconsin Menominee 4451 36.8 59.1 0.4
# Count counties per state, labelling the count column num_counties
counties %>%
  count(state) %>%
  rename(num_counties = n)
## # A tibble: 50 x 2
## state num_counties
## <chr> <int>
## 1 Alabama 67
## 2 Alaska 28
## 3 Arizona 15
## 4 Arkansas 75
## 5 California 58
## 6 Colorado 64
## 7 Connecticut 8
## 8 Delaware 3
## 9 Florida 67
## 10 Georgia 159
## # ... with 40 more rows
# Keep state and county, renaming poverty to poverty_rate on the way
counties %>%
  select(state, county, poverty_rate = poverty)
## # A tibble: 3,138 x 3
## state county poverty_rate
## <chr> <chr> <dbl>
## 1 Alabama Autauga 12.9
## 2 Alabama Baldwin 13.4
## 3 Alabama Barbour 26.7
## 4 Alabama Bibb 16.8
## 5 Alabama Blount 16.7
## 6 Alabama Bullock 24.6
## 7 Alabama Butler 25.4
## 8 Alabama Calhoun 20.5
## 9 Alabama Chambers 21.6
## 10 Alabama Cherokee 19.2
## # ... with 3,128 more rows
counties %>%
  # Keep the state, county, and population columns, and add a density
  # column (people per unit of land area)
  transmute(state, county, population, density = population / land_area) %>%
  # Restrict to counties of more than one million people
  filter(population > 1000000) %>%
  # Least dense of the large counties first
  arrange(density)
## # A tibble: 41 x 4
## state county population density
## <chr> <chr> <dbl> <dbl>
## 1 California San Bernardino 2094769 104.
## 2 Nevada Clark 2035572 258.
## 3 California Riverside 2298032 319.
## 4 Arizona Maricopa 4018143 437.
## 5 Florida Palm Beach 1378806 700.
## 6 California San Diego 3223096 766.
## 7 Washington King 2045756 967.
## 8 Texas Travis 1121645 1133.
## 9 Florida Hillsborough 1302884 1277.
## 10 Florida Orange 1229039 1360.
## # ... with 31 more rows
# Give the unemployment column a more descriptive name
counties %>%
  rename(unemployment_rate = unemployment)
## # A tibble: 3,138 x 40
## census_id state county region metro population men women hispanic white
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1001 Alab~ Autau~ South Metro 55221 26745 28476 2.6 75.8
## 2 1003 Alab~ Baldw~ South Metro 195121 95314 99807 4.5 83.1
## 3 1005 Alab~ Barbo~ South Nonm~ 26932 14497 12435 4.6 46.2
## 4 1007 Alab~ Bibb South Metro 22604 12073 10531 2.2 74.5
## 5 1009 Alab~ Blount South Metro 57710 28512 29198 8.6 87.9
## 6 1011 Alab~ Bullo~ South Nonm~ 10678 5660 5018 4.4 22.2
## 7 1013 Alab~ Butler South Nonm~ 20354 9502 10852 1.2 53.3
## 8 1015 Alab~ Calho~ South Metro 116648 56274 60374 3.5 73
## 9 1017 Alab~ Chamb~ South Nonm~ 34079 16258 17821 0.4 57.3
## 10 1019 Alab~ Chero~ South Nonm~ 26008 12975 13033 1.5 91.7
## # ... with 3,128 more rows, and 30 more variables: black <dbl>, native <dbl>,
## # asian <dbl>, pacific <dbl>, citizens <dbl>, income <dbl>, income_err <dbl>,
## # income_per_cap <dbl>, income_per_cap_err <dbl>, poverty <dbl>,
## # child_poverty <dbl>, professional <dbl>, service <dbl>, office <dbl>,
## # construction <dbl>, production <dbl>, drive <dbl>, carpool <dbl>,
## # transit <dbl>, walk <dbl>, other_transp <dbl>, work_at_home <dbl>,
## # mean_commute <dbl>, employed <dbl>, private_work <dbl>, public_work <dbl>,
## # self_employed <dbl>, family_work <dbl>, unemployment_rate <dbl>,
## # land_area <dbl>
# State and county, plus any column with "poverty" in its name
counties %>%
  select(state, county, contains("poverty"))
## # A tibble: 3,138 x 4
## state county poverty child_poverty
## <chr> <chr> <dbl> <dbl>
## 1 Alabama Autauga 12.9 18.6
## 2 Alabama Baldwin 13.4 19.2
## 3 Alabama Barbour 26.7 45.3
## 4 Alabama Bibb 16.8 27.9
## 5 Alabama Blount 16.7 27.2
## 6 Alabama Bullock 24.6 38.4
## 7 Alabama Butler 25.4 39.2
## 8 Alabama Calhoun 20.5 31.6
## 9 Alabama Chambers 21.6 37.2
## 10 Alabama Cherokee 19.2 30.1
## # ... with 3,128 more rows
# Add fraction_women (share of the population that is female),
# keeping every existing column
counties %>%
  mutate(fraction_women = women / population)
## # A tibble: 3,138 x 41
## census_id state county region metro population men women hispanic white
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1001 Alab~ Autau~ South Metro 55221 26745 28476 2.6 75.8
## 2 1003 Alab~ Baldw~ South Metro 195121 95314 99807 4.5 83.1
## 3 1005 Alab~ Barbo~ South Nonm~ 26932 14497 12435 4.6 46.2
## 4 1007 Alab~ Bibb South Metro 22604 12073 10531 2.2 74.5
## 5 1009 Alab~ Blount South Metro 57710 28512 29198 8.6 87.9
## 6 1011 Alab~ Bullo~ South Nonm~ 10678 5660 5018 4.4 22.2
## 7 1013 Alab~ Butler South Nonm~ 20354 9502 10852 1.2 53.3
## 8 1015 Alab~ Calho~ South Metro 116648 56274 60374 3.5 73
## 9 1017 Alab~ Chamb~ South Nonm~ 34079 16258 17821 0.4 57.3
## 10 1019 Alab~ Chero~ South Nonm~ 26008 12975 13033 1.5 91.7
## # ... with 3,128 more rows, and 31 more variables: black <dbl>, native <dbl>,
## # asian <dbl>, pacific <dbl>, citizens <dbl>, income <dbl>, income_err <dbl>,
## # income_per_cap <dbl>, income_per_cap_err <dbl>, poverty <dbl>,
## # child_poverty <dbl>, professional <dbl>, service <dbl>, office <dbl>,
## # construction <dbl>, production <dbl>, drive <dbl>, carpool <dbl>,
## # transit <dbl>, walk <dbl>, other_transp <dbl>, work_at_home <dbl>,
## # mean_commute <dbl>, employed <dbl>, private_work <dbl>, public_work <dbl>,
## # self_employed <dbl>, family_work <dbl>, unemployment <dbl>,
## # land_area <dbl>, fraction_women <dbl>
# transmute keeps only what is named: state, county, and the new
# employment_rate (employed people as a share of total population)
counties %>%
  transmute(state, county, employment_rate = employed / population)
## # A tibble: 3,138 x 3
## state county employment_rate
## <chr> <chr> <dbl>
## 1 Alabama Autauga 0.434
## 2 Alabama Baldwin 0.441
## 3 Alabama Barbour 0.319
## 4 Alabama Bibb 0.367
## 5 Alabama Blount 0.384
## 6 Alabama Bullock 0.362
## 7 Alabama Butler 0.384
## 8 Alabama Calhoun 0.406
## 9 Alabama Chambers 0.402
## 10 Alabama Cherokee 0.390
## # ... with 3,128 more rows
Chapter 4 - Case Study
The babynames dataset:
Grouped Mutates:
Window Functions:
Wrap Up:
Example code includes:
babynames %>%
  # Restrict to the year 1990...
  filter(year == 1990) %>%
  # ...and list the most popular names first
  arrange(desc(number))
## # A tibble: 21,223 x 3
## year name number
## <dbl> <chr> <int>
## 1 1990 Michael 65560
## 2 1990 Christopher 52520
## 3 1990 Jessica 46615
## 4 1990 Ashley 45797
## 5 1990 Matthew 44925
## 6 1990 Joshua 43382
## 7 1990 Brittany 36650
## 8 1990 Amanda 34504
## 9 1990 Daniel 33963
## 10 1990 David 33862
## # ... with 21,213 more rows
# For every year, keep only the row with the largest count --
# i.e. that year's most common name
babynames %>%
  group_by(year) %>%
  top_n(1, number)
## # A tibble: 28 x 3
## # Groups: year [28]
## year name number
## <dbl> <chr> <int>
## 1 1880 John 9701
## 2 1885 Mary 9166
## 3 1890 Mary 12113
## 4 1895 Mary 13493
## 5 1900 Mary 16781
## 6 1905 Mary 16135
## 7 1910 Mary 22947
## 8 1915 Mary 58346
## 9 1920 Mary 71175
## 10 1925 Mary 70857
## # ... with 18 more rows
# Keep just the three names of interest
selected_names <- babynames %>%
  filter(name %in% c("Steven", "Thomas", "Matthew"))
# One line per name, distinguished by colour
ggplot(selected_names, aes(x = year, y = number, color = name)) +
  geom_line()
# For each name, find the year in which it was most popular
babynames %>%
  # Total births recorded per year...
  group_by(year) %>%
  mutate(year_total = sum(number)) %>%
  ungroup() %>%
  # ...gives each name's share of that year's births
  mutate(fraction = number / year_total) %>%
  # Keep the peak-share year for every name
  group_by(name) %>%
  top_n(1, fraction)
## # A tibble: 48,040 x 5
## # Groups: name [48,040]
## year name number year_total fraction
## <dbl> <chr> <int> <int> <dbl>
## 1 1880 Abbott 5 201478 0.0000248
## 2 1880 Abe 50 201478 0.000248
## 3 1880 Abner 27 201478 0.000134
## 4 1880 Adelbert 28 201478 0.000139
## 5 1880 Adella 26 201478 0.000129
## 6 1880 Adolf 6 201478 0.0000298
## 7 1880 Adolph 93 201478 0.000462
## 8 1880 Agustus 5 201478 0.0000248
## 9 1880 Albert 1493 201478 0.00741
## 10 1880 Albertina 7 201478 0.0000347
## # ... with 48,030 more rows
names_normalized <- babynames %>%
  # Per-name totals and per-name peak counts
  group_by(name) %>%
  mutate(name_total = sum(number), name_max = max(number)) %>%
  # Drop the grouping before the row-wise division
  ungroup() %>%
  # fraction_max: each year's count divided by that name's peak count
  mutate(fraction_max = number / name_max)
names_normalized
## # A tibble: 332,595 x 6
## year name number name_total name_max fraction_max
## <dbl> <chr> <int> <int> <int> <dbl>
## 1 1880 Aaron 102 114739 14635 0.00697
## 2 1880 Ab 5 77 31 0.161
## 3 1880 Abbie 71 4330 445 0.160
## 4 1880 Abbott 5 217 51 0.0980
## 5 1880 Abby 6 11272 1753 0.00342
## 6 1880 Abe 50 1832 271 0.185
## 7 1880 Abel 9 10565 3245 0.00277
## 8 1880 Abigail 12 72600 15762 0.000761
## 9 1880 Abner 27 1552 199 0.136
## 10 1880 Abraham 81 17882 2449 0.0331
## # ... with 332,585 more rows
# Keep just the three names of interest
names_filtered <- names_normalized %>%
  filter(name %in% c("Steven", "Thomas", "Matthew"))
# Plot each name's count relative to its own peak over time
ggplot(names_filtered, aes(x = year, y = fraction_max, color = name)) +
  geom_line()
# Share of each year's births accounted for by every name
babynames_fraction <- babynames %>%
  group_by(year) %>%
  mutate(year_total = sum(number)) %>%
  ungroup() %>%
  mutate(fraction = number / year_total)
babynames_fraction
## # A tibble: 332,595 x 5
## year name number year_total fraction
## <dbl> <chr> <int> <int> <dbl>
## 1 1880 Aaron 102 201478 0.000506
## 2 1880 Ab 5 201478 0.0000248
## 3 1880 Abbie 71 201478 0.000352
## 4 1880 Abbott 5 201478 0.0000248
## 5 1880 Abby 6 201478 0.0000298
## 6 1880 Abe 50 201478 0.000248
## 7 1880 Abel 9 201478 0.0000447
## 8 1880 Abigail 12 201478 0.0000596
## 9 1880 Abner 27 201478 0.000134
## 10 1880 Abraham 81 201478 0.000402
## # ... with 332,585 more rows
babynames_fraction %>%
  # Order rows so lag() compares consecutive periods of the same name
  arrange(name, year) %>%
  group_by(name) %>%
  # ratio: this period's share relative to the name's previous period
  mutate(ratio = fraction / lag(fraction))
## # A tibble: 332,595 x 6
## # Groups: name [48,040]
## year name number year_total fraction ratio
## <dbl> <chr> <int> <int> <dbl> <dbl>
## 1 2010 Aaban 9 3672066 0.00000245 NA
## 2 2015 Aaban 15 3648781 0.00000411 1.68
## 3 1995 Aadam 6 3652750 0.00000164 NA
## 4 2000 Aadam 6 3767293 0.00000159 0.970
## 5 2005 Aadam 6 3828460 0.00000157 0.984
## 6 2010 Aadam 7 3672066 0.00000191 1.22
## 7 2015 Aadam 22 3648781 0.00000603 3.16
## 8 2010 Aadan 11 3672066 0.00000300 NA
## 9 2015 Aadan 10 3648781 0.00000274 0.915
## 10 2000 Aadarsh 5 3767293 0.00000133 NA
## # ... with 332,585 more rows
# Period-over-period growth ratios, keeping only reasonably common names
babynames_ratios_filtered <- babynames_fraction %>%
  arrange(name, year) %>%
  group_by(name) %>%
  mutate(ratio = fraction / lag(fraction)) %>%
  # Drop very rare names (share below 1 in 100,000)
  filter(fraction >= 0.00001)
babynames_ratios_filtered
## # A tibble: 104,344 x 6
## # Groups: name [14,807]
## year name number year_total fraction ratio
## <dbl> <chr> <int> <int> <dbl> <dbl>
## 1 2010 Aaden 450 3672066 0.000123 14.2
## 2 2015 Aaden 297 3648781 0.0000814 0.664
## 3 2015 Aadhya 265 3648781 0.0000726 14.0
## 4 2005 Aadi 51 3828460 0.0000133 NA
## 5 2010 Aadi 54 3672066 0.0000147 1.10
## 6 2015 Aadi 43 3648781 0.0000118 0.801
## 7 2010 Aaditya 37 3672066 0.0000101 1.48
## 8 2015 Aadya 159 3648781 0.0000436 4.85
## 9 2010 Aadyn 38 3672066 0.0000103 3.60
## 10 2010 Aahana 64 3672066 0.0000174 5.13
## # ... with 104,334 more rows
babynames_ratios_filtered %>%
  # Biggest single jump in popularity for each name
  top_n(1, ratio) %>%
  # Largest jumps first
  arrange(desc(ratio)) %>%
  # Keep only names that reached at least 0.1% of births that year
  filter(fraction >= 0.001)
## # A tibble: 291 x 6
## # Groups: name [291]
## year name number year_total fraction ratio
## <dbl> <chr> <int> <int> <dbl> <dbl>
## 1 1960 Tammy 14365 4152075 0.00346 70.1
## 2 2005 Nevaeh 4610 3828460 0.00120 45.8
## 3 1940 Brenda 5460 2301630 0.00237 37.5
## 4 1885 Grover 774 240822 0.00321 36.0
## 5 1945 Cheryl 8170 2652029 0.00308 24.9
## 6 1955 Lori 4980 4012691 0.00124 23.2
## 7 2010 Khloe 5411 3672066 0.00147 23.2
## 8 1950 Debra 6189 3502592 0.00177 22.6
## 9 2010 Bentley 4001 3672066 0.00109 22.4
## 10 1935 Marlene 4840 2088487 0.00232 16.8
## # ... with 281 more rows
Chapter 1 - How to Write a Function
Rationale for Using Functions:
Converting Scripts into Functions:
Code Readability:
Example code includes:
# Gold-medal counts per country, keyed by IOC country code; the final entry
# ('IOC') is NA -- no count recorded for it.  NOTE(review): presumably the
# Rio 2016 medal table given USA = 46 -- confirm against the course data.
gold_medals <- c(46, 27, 26, 19, 17, 12, 10, 9, 8, 8, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 4, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, NA)
# Attach the IOC codes as element names (same order as the counts above)
names(gold_medals) <- c('USA', 'GBR', 'CHN', 'RUS', 'GER', 'JPN', 'FRA', 'KOR', 'ITA', 'AUS', 'NED', 'HUN', 'BRA', 'ESP', 'KEN', 'JAM', 'CRO', 'CUB', 'NZL', 'CAN', 'UZB', 'KAZ', 'COL', 'SUI', 'IRI', 'GRE', 'ARG', 'DEN', 'SWE', 'RSA', 'UKR', 'SRB', 'POL', 'PRK', 'BEL', 'THA', 'SVK', 'GEO', 'AZE', 'BLR', 'TUR', 'ARM', 'CZE', 'ETH', 'SLO', 'INA', 'ROU', 'BRN', 'VIE', 'TPE', 'BAH', 'IOA', 'CIV', 'FIJ', 'JOR', 'KOS', 'PUR', 'SIN', 'TJK', 'MAS', 'MEX', 'VEN', 'ALG', 'IRL', 'LTU', 'BUL', 'IND', 'MGL', 'BDI', 'GRN', 'NIG', 'PHI', 'QAT', 'NOR', 'EGY', 'TUN', 'ISR', 'AUT', 'DOM', 'EST', 'FIN', 'MAR', 'NGR', 'POR', 'TTO', 'UAE', 'IOC')
# Look at the gold medals data
gold_medals
## USA GBR CHN RUS GER JPN FRA KOR ITA AUS NED HUN BRA ESP KEN JAM CRO CUB NZL CAN
## 46 27 26 19 17 12 10 9 8 8 8 8 7 7 6 6 5 5 4 4
## UZB KAZ COL SUI IRI GRE ARG DEN SWE RSA UKR SRB POL PRK BEL THA SVK GEO AZE BLR
## 4 3 3 3 3 3 3 2 2 2 2 2 2 2 2 2 2 2 1 1
## TUR ARM CZE ETH SLO INA ROU BRN VIE TPE BAH IOA CIV FIJ JOR KOS PUR SIN TJK MAS
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0
## MEX VEN ALG IRL LTU BUL IND MGL BDI GRN NIG PHI QAT NOR EGY TUN ISR AUT DOM EST
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## FIN MAR NGR POR TTO UAE IOC
## 0 0 0 0 0 0 NA
# Note the arguments to median()
args(median)
# Rewrite this function call, following best practices
# na.rm is passed by name (not position) so the single NA entry ('IOC')
# is dropped instead of making the whole result NA
median(gold_medals, na.rm=TRUE)
## [1] 1
# Note the arguments to rank()
args(rank)
# Rewrite this function call, following best practices
# Negate the counts so the most golds rank 1st; na.last = "keep" leaves the
# NA entry as NA, and ties.method = "min" gives tied countries the best
# (smallest) shared rank
rank(-gold_medals, na.last="keep", ties.method = "min")
## USA GBR CHN RUS GER JPN FRA KOR ITA AUS NED HUN BRA ESP KEN JAM CRO CUB NZL CAN
## 1 2 3 4 5 6 7 8 9 9 9 9 13 13 15 15 17 17 19 19
## UZB KAZ COL SUI IRI GRE ARG DEN SWE RSA UKR SRB POL PRK BEL THA SVK GEO AZE BLR
## 19 22 22 22 22 22 22 28 28 28 28 28 28 28 28 28 28 28 39 39
## TUR ARM CZE ETH SLO INA ROU BRN VIE TPE BAH IOA CIV FIJ JOR KOS PUR SIN TJK MAS
## 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 60
## MEX VEN ALG IRL LTU BUL IND MGL BDI GRN NIG PHI QAT NOR EGY TUN ISR AUT DOM EST
## 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60
## FIN MAR NGR POR TTO UAE IOC
## 60 60 60 60 60 60 NA
# The two possible outcomes of a coin toss
coin_sides <- c("head", "tail")
# Draw a single outcome at random
sample(coin_sides, size = 1)
## [1] "head"
# Your functions, from previous steps
# Zero-argument coin flipper: returns "head" or "tail" with equal chance
toss_coin <- function() {
  sides <- c("head", "tail")
  sample(sides, size = 1)
}
# Call your function
toss_coin()
## [1] "tail"
# Update the function to return n coin tosses
toss_coin <- function(n_flips) {
  sides <- c("head", "tail")
  # Sampling with replacement allows more tosses than sides
  sample(sides, size = n_flips, replace = TRUE)
}
# Generate 10 coin tosses
toss_coin(10)
## [1] "tail" "head" "tail" "tail" "tail" "tail" "tail" "head" "tail" "tail"
# Update the function so heads have probability p_head
#
# Toss a (possibly biased) coin n_flips times.
#
# Args:
#   n_flips: number of tosses to simulate.
#   p_head:  probability of "head" on each toss; defaults to 0.5, i.e. a
#            fair coin, so existing calls that pass p_head are unaffected.
#
# Returns a character vector of "head"/"tail" of length n_flips.
toss_coin <- function(n_flips, p_head = 0.5) {
  coin_sides <- c("head", "tail")
  # Define a vector of weights: heads with probability p_head,
  # tails with the complement
  weights <- c(p_head, 1 - p_head)
  # Weighted sampling with replacement
  sample(coin_sides, n_flips, replace = TRUE, prob = weights)
}
# Generate 10 coin tosses
toss_coin(10, p_head = 0.8)
## [1] "head" "head" "head" "head" "head" "tail" "head" "head" "tail" "tail"
# Load the Snake River visitor survey (the str() below shows 410 rows with
# n_visits, gender, income, and travel; travel contains NAs)
snake_river_visits <- readRDS("./RInputFiles/snake_river_visits.rds")
str(snake_river_visits)
## 'data.frame': 410 obs. of 4 variables:
## $ n_visits: num 0 0 0 0 0 0 0 0 0 0 ...
## $ gender : Factor w/ 2 levels "male","female": 1 1 1 2 1 2 2 2 1 1 ...
## $ income : Factor w/ 4 levels "[$0,$25k]","($25k,$55k]",..: 4 2 4 2 4 2 4 4 4 4 ...
## $ travel : Factor w/ 3 levels "[0h,0.25h]","(0.25h,4h]",..: NA NA NA NA NA NA NA NA NA NA ...
# Run a generalized linear regression
# Poisson regression of visit counts on the three survey covariates;
# printing the unassigned result shows coefficients and deviances
glm(
  # Model no. of visits vs. gender, income, travel
  n_visits ~ gender + income + travel,
  # Use the snake_river_visits dataset
  data = snake_river_visits,
  # Counts -> Poisson error family
  family = poisson
)
##
## Call: glm(formula = n_visits ~ gender + income + travel, family = poisson,
## data = snake_river_visits)
##
## Coefficients:
## (Intercept) genderfemale income($25k,$55k] income($55k,$95k]
## 4.0864 0.3740 -0.0199 -0.5807
## income($95k,$Inf) travel(0.25h,4h] travel(4h,Infh)
## -0.5782 -0.6271 -2.4230
##
## Degrees of Freedom: 345 Total (i.e. Null); 339 Residual
## (64 observations deleted due to missingness)
## Null Deviance: 18900
## Residual Deviance: 11500 AIC: 12900
# From previous step
# Fit a Poisson GLM of `formula` on `data`.  The data-first argument order
# lets a data frame be piped straight into the function.
run_poisson_regression <- function(data, formula) {
  # Name `data` explicitly so it cannot be positionally matched to `family`
  glm(formula, data = data, family = poisson)
}
# Re-run the Poisson regression, using your function
# (the survey data frame is piped in as the first argument)
model <- snake_river_visits %>%
run_poisson_regression(n_visits ~ gender + income + travel)
# Factor levels for the income and travel-time survey questions
icLevels <- c("[$0,$25k]", "($25k,$55k]", "($55k,$95k]", "($95k,$Inf)")
trLevels <- c("[0h,0.25h]", "(0.25h,4h]", "(4h,Infh)")
# Build every gender x income x travel combination (2 x 4 x 3 = 24 rows):
# gender alternates fastest, income next, travel slowest
srGender <- rep(c("male", "female"), times = 12)
srIncome <- rep(rep(icLevels, each = 2), times = 3)
srTravel <- rep(trLevels, each = 8)
snake_river_explanatory <- data.frame(
  gender = factor(srGender, levels = c("male", "female")),
  income = factor(srIncome, levels = icLevels),
  travel = factor(srTravel, levels = trLevels)
)
str(snake_river_explanatory)
## 'data.frame': 24 obs. of 3 variables:
## $ gender: Factor w/ 2 levels "male","female": 1 2 1 2 1 2 1 2 1 2 ...
## $ income: Factor w/ 4 levels "[$0,$25k]","($25k,$55k]",..: 1 1 2 2 3 3 4 4 1 1 ...
## $ travel: Factor w/ 3 levels "[0h,0.25h]","(0.25h,4h]",..: 1 1 1 1 1 1 1 1 2 2 ...
# Run this to see the predictions
# Attach the model's expected visit count to each profile,
# showing the highest-visiting profiles first
snake_river_explanatory %>%
  mutate(predicted_n_visits = predict(model, ., type = "response")) %>%
  arrange(desc(predicted_n_visits))
## gender income travel predicted_n_visits
## 1 female [$0,$25k] [0h,0.25h] 86.52
## 2 female ($25k,$55k] [0h,0.25h] 84.81
## 3 male [$0,$25k] [0h,0.25h] 59.52
## 4 male ($25k,$55k] [0h,0.25h] 58.35
## 5 female ($95k,$Inf) [0h,0.25h] 48.53
## 6 female ($55k,$95k] [0h,0.25h] 48.41
## 7 female [$0,$25k] (0.25h,4h] 46.21
## 8 female ($25k,$55k] (0.25h,4h] 45.30
## 9 male ($95k,$Inf) [0h,0.25h] 33.39
## 10 male ($55k,$95k] [0h,0.25h] 33.30
## 11 male [$0,$25k] (0.25h,4h] 31.79
## 12 male ($25k,$55k] (0.25h,4h] 31.17
## 13 female ($95k,$Inf) (0.25h,4h] 25.92
## 14 female ($55k,$95k] (0.25h,4h] 25.86
## 15 male ($95k,$Inf) (0.25h,4h] 17.83
## 16 male ($55k,$95k] (0.25h,4h] 17.79
## 17 female [$0,$25k] (4h,Infh) 7.67
## 18 female ($25k,$55k] (4h,Infh) 7.52
## 19 male [$0,$25k] (4h,Infh) 5.28
## 20 male ($25k,$55k] (4h,Infh) 5.17
## 21 female ($95k,$Inf) (4h,Infh) 4.30
## 22 female ($55k,$95k] (4h,Infh) 4.29
## 23 male ($95k,$Inf) (4h,Infh) 2.96
## 24 male ($55k,$95k] (4h,Infh) 2.95
Chapter 2 - Arguments
Default Arguments:
Passing Arguments Between Functions:
Checking Arguments:
Example code includes:
# Pull the visit counts out of the survey data frame
n_visits <- snake_river_visits[["n_visits"]]
summary(n_visits)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 1 4 25 30 350
# Set the default for n to 5
# Bin a numeric vector x into n quantile-based categories.
#   n: number of bins (defaults to 5, i.e. quintiles)
#   na.rm: drop NAs when computing quantiles?
#   labels: labels for the bins
#   interval_type: "(lo, hi]" for right-closed or "[lo, hi)" for left-closed
cut_by_quantile <- function(x, n=5, na.rm, labels, interval_type) {
  # n bins need n + 1 evenly spaced probabilities in [0, 1]
  cut_points <- seq(0, 1, length.out = n + 1)
  qtiles <- quantile(x, cut_points, na.rm = na.rm, names = FALSE)
  # Intervals are closed on the right exactly when written "(lo, hi]"
  right <- switch(interval_type, "(lo, hi]" = TRUE, "[lo, hi)" = FALSE)
  cut(x, qtiles, labels = labels, right = right, include.lowest = TRUE)
}
# Remove the n argument from the call
# (n now defaults to 5, so the five labels below map onto quintiles)
cut_by_quantile(n_visits, na.rm = FALSE,
labels = c("very low", "low", "medium", "high", "very high"), interval_type = "(lo, hi]"
)
## [1] very low very low very low very low very low very low very low
## [8] very low very low very low very low very low very low very low
## [15] very low very low very low very low very low high very high
## [22] high very low medium low very low very low very low
## [29] very low very low very low very high very high very high very high
## [36] very high high very high very high very high very high very high
## [43] medium very high very high very high medium medium low
## [50] high high high very high very high high high
## [57] very high medium very high high medium high very high
## [64] very high very high very high high high very high high
## [71] very low very high high high medium high high
## [78] high medium very high very high very high high high
## [85] high very low very high medium high very high high
## [92] high very high high very low very low medium very low
## [99] medium medium very high medium medium medium high
## [106] low high very high medium very high medium very high
## [113] low very high low very high high very low very low
## [120] very low very low low very low very low very low very low
## [127] very low very low medium very low very low low low
## [134] very low very low low very low very low very low low
## [141] low medium medium medium medium medium very low
## [148] very low low very low low medium very low very low
## [155] very low very low very high high very high high medium
## [162] very high medium very low high medium high high
## [169] very high high high very high very high high very high
## [176] high high medium very high high high high
## [183] very high very high very low high very high high high
## [190] medium very high high very high high very high high
## [197] very high high very high very low high very high very high
## [204] very low very low medium very high medium low medium
## [211] high medium very low medium very high high very high
## [218] high very high high low high medium very high
## [225] medium high high high very low high high
## [232] high very high high medium medium very low very low
## [239] very low very low medium low very low very low very low
## [246] medium high very low very low medium very low very low
## [253] very low very low very low very low very low very low very low
## [260] very low very high medium very low very high medium very high
## [267] medium low very high medium medium medium low
## [274] high medium high very high medium very high very high
## [281] medium medium very high high medium very high high
## [288] medium low very low medium very low very low very low
## [295] very low very low low very low very low very low very low
## [302] very low very low very low very low low very low very low
## [309] very low very low low very low very low low very low
## [316] very low very low very low low very low very low very low
## [323] very low very low low very low very low very low very low
## [330] very low very low very low very low very low very low very low
## [337] very low very low very low very low very low very low very low
## [344] very low very low medium very low very low very low very low
## [351] very low very low very low very low very low very low very low
## [358] very low low very low very low very low very low very low
## [365] very low very low very low very low very low very low low
## [372] very low very low very low very high high very high very high
## [379] very high high very high very high very high very high medium
## [386] medium medium high very high high high high
## [393] high high high high very high high very high
## [400] medium high low high very high low very low
## [407] medium very low medium low
## Levels: very low low medium high very high
# Set the default for na.rm to FALSE
# Same quantile-binning helper, now with na.rm defaulting to FALSE
# (matching quantile()'s own default behaviour of failing on NAs)
cut_by_quantile <- function(x, n = 5, na.rm=FALSE, labels, interval_type) {
  # Evenly spaced probabilities: n bins need n + 1 break points
  q_probs <- seq(0, 1, length.out = n + 1)
  breaks <- quantile(x, q_probs, na.rm = na.rm, names = FALSE)
  # Right-closed bins iff intervals are written "(lo, hi]"
  right <- switch(interval_type, "(lo, hi]" = TRUE, "[lo, hi)" = FALSE)
  cut(x, breaks, labels = labels, right = right, include.lowest = TRUE)
}
# Remove the na.rm argument from the call
# (na.rm now defaults to FALSE; n_visits has no NAs so this is safe)
cut_by_quantile(n_visits, labels = c("very low", "low", "medium", "high", "very high"),
interval_type = "(lo, hi]"
)
## [1] very low very low very low very low very low very low very low
## [8] very low very low very low very low very low very low very low
## [15] very low very low very low very low very low high very high
## [22] high very low medium low very low very low very low
## [29] very low very low very low very high very high very high very high
## [36] very high high very high very high very high very high very high
## [43] medium very high very high very high medium medium low
## [50] high high high very high very high high high
## [57] very high medium very high high medium high very high
## [64] very high very high very high high high very high high
## [71] very low very high high high medium high high
## [78] high medium very high very high very high high high
## [85] high very low very high medium high very high high
## [92] high very high high very low very low medium very low
## [99] medium medium very high medium medium medium high
## [106] low high very high medium very high medium very high
## [113] low very high low very high high very low very low
## [120] very low very low low very low very low very low very low
## [127] very low very low medium very low very low low low
## [134] very low very low low very low very low very low low
## [141] low medium medium medium medium medium very low
## [148] very low low very low low medium very low very low
## [155] very low very low very high high very high high medium
## [162] very high medium very low high medium high high
## [169] very high high high very high very high high very high
## [176] high high medium very high high high high
## [183] very high very high very low high very high high high
## [190] medium very high high very high high very high high
## [197] very high high very high very low high very high very high
## [204] very low very low medium very high medium low medium
## [211] high medium very low medium very high high very high
## [218] high very high high low high medium very high
## [225] medium high high high very low high high
## [232] high very high high medium medium very low very low
## [239] very low very low medium low very low very low very low
## [246] medium high very low very low medium very low very low
## [253] very low very low very low very low very low very low very low
## [260] very low very high medium very low very high medium very high
## [267] medium low very high medium medium medium low
## [274] high medium high very high medium very high very high
## [281] medium medium very high high medium very high high
## [288] medium low very low medium very low very low very low
## [295] very low very low low very low very low very low very low
## [302] very low very low very low very low low very low very low
## [309] very low very low low very low very low low very low
## [316] very low very low very low low very low very low very low
## [323] very low very low low very low very low very low very low
## [330] very low very low very low very low very low very low very low
## [337] very low very low very low very low very low very low very low
## [344] very low very low medium very low very low very low very low
## [351] very low very low very low very low very low very low very low
## [358] very low low very low very low very low very low very low
## [365] very low very low very low very low very low very low low
## [372] very low very low very low very high high very high very high
## [379] very high high very high very high very high very high medium
## [386] medium medium high very high high high high
## [393] high high high high very high high very high
## [400] medium high low high very high low very low
## [407] medium very low medium low
## Levels: very low low medium high very high
# Set the default for labels to NULL
# Same helper; labels now defaults to NULL, in which case cut() labels
# each bin with its interval notation, e.g. "(2,10]"
cut_by_quantile <- function(x, n = 5, na.rm = FALSE, labels=NULL, interval_type) {
  # n bins require n + 1 quantile probabilities
  bin_probs <- seq(0, 1, length.out = n + 1)
  bin_breaks <- quantile(x, bin_probs, na.rm = na.rm, names = FALSE)
  # Map the interval notation onto cut()'s logical `right` argument
  right <- switch(interval_type, "(lo, hi]" = TRUE, "[lo, hi)" = FALSE)
  cut(x, bin_breaks, labels = labels, right = right, include.lowest = TRUE)
}
# Remove the labels argument from the call
# (labels defaults to NULL, so bins are labelled by their intervals)
cut_by_quantile(n_visits, interval_type = "(lo, hi]")
## [1] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1]
## [9] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1]
## [17] [0,1] [0,1] [0,1] (10,35] (35,350] (10,35] [0,1] (2,10]
## [25] (1,2] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1] (35,350]
## [33] (35,350] (35,350] (35,350] (35,350] (10,35] (35,350] (35,350] (35,350]
## [41] (35,350] (35,350] (2,10] (35,350] (35,350] (35,350] (2,10] (2,10]
## [49] (1,2] (10,35] (10,35] (10,35] (35,350] (35,350] (10,35] (10,35]
## [57] (35,350] (2,10] (35,350] (10,35] (2,10] (10,35] (35,350] (35,350]
## [65] (35,350] (35,350] (10,35] (10,35] (35,350] (10,35] [0,1] (35,350]
## [73] (10,35] (10,35] (2,10] (10,35] (10,35] (10,35] (2,10] (35,350]
## [81] (35,350] (35,350] (10,35] (10,35] (10,35] [0,1] (35,350] (2,10]
## [89] (10,35] (35,350] (10,35] (10,35] (35,350] (10,35] [0,1] [0,1]
## [97] (2,10] [0,1] (2,10] (2,10] (35,350] (2,10] (2,10] (2,10]
## [105] (10,35] (1,2] (10,35] (35,350] (2,10] (35,350] (2,10] (35,350]
## [113] (1,2] (35,350] (1,2] (35,350] (10,35] [0,1] [0,1] [0,1]
## [121] [0,1] (1,2] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1]
## [129] (2,10] [0,1] [0,1] (1,2] (1,2] [0,1] [0,1] (1,2]
## [137] [0,1] [0,1] [0,1] (1,2] (1,2] (2,10] (2,10] (2,10]
## [145] (2,10] (2,10] [0,1] [0,1] (1,2] [0,1] (1,2] (2,10]
## [153] [0,1] [0,1] [0,1] [0,1] (35,350] (10,35] (35,350] (10,35]
## [161] (2,10] (35,350] (2,10] [0,1] (10,35] (2,10] (10,35] (10,35]
## [169] (35,350] (10,35] (10,35] (35,350] (35,350] (10,35] (35,350] (10,35]
## [177] (10,35] (2,10] (35,350] (10,35] (10,35] (10,35] (35,350] (35,350]
## [185] [0,1] (10,35] (35,350] (10,35] (10,35] (2,10] (35,350] (10,35]
## [193] (35,350] (10,35] (35,350] (10,35] (35,350] (10,35] (35,350] [0,1]
## [201] (10,35] (35,350] (35,350] [0,1] [0,1] (2,10] (35,350] (2,10]
## [209] (1,2] (2,10] (10,35] (2,10] [0,1] (2,10] (35,350] (10,35]
## [217] (35,350] (10,35] (35,350] (10,35] (1,2] (10,35] (2,10] (35,350]
## [225] (2,10] (10,35] (10,35] (10,35] [0,1] (10,35] (10,35] (10,35]
## [233] (35,350] (10,35] (2,10] (2,10] [0,1] [0,1] [0,1] [0,1]
## [241] (2,10] (1,2] [0,1] [0,1] [0,1] (2,10] (10,35] [0,1]
## [249] [0,1] (2,10] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1]
## [257] [0,1] [0,1] [0,1] [0,1] (35,350] (2,10] [0,1] (35,350]
## [265] (2,10] (35,350] (2,10] (1,2] (35,350] (2,10] (2,10] (2,10]
## [273] (1,2] (10,35] (2,10] (10,35] (35,350] (2,10] (35,350] (35,350]
## [281] (2,10] (2,10] (35,350] (10,35] (2,10] (35,350] (10,35] (2,10]
## [289] (1,2] [0,1] (2,10] [0,1] [0,1] [0,1] [0,1] [0,1]
## [297] (1,2] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1]
## [305] [0,1] (1,2] [0,1] [0,1] [0,1] [0,1] (1,2] [0,1]
## [313] [0,1] (1,2] [0,1] [0,1] [0,1] [0,1] (1,2] [0,1]
## [321] [0,1] [0,1] [0,1] [0,1] (1,2] [0,1] [0,1] [0,1]
## [329] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1]
## [337] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1]
## [345] [0,1] (2,10] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1]
## [353] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1] (1,2] [0,1]
## [361] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1]
## [369] [0,1] [0,1] (1,2] [0,1] [0,1] [0,1] (35,350] (10,35]
## [377] (35,350] (35,350] (35,350] (10,35] (35,350] (35,350] (35,350] (35,350]
## [385] (2,10] (2,10] (2,10] (10,35] (35,350] (10,35] (10,35] (10,35]
## [393] (10,35] (10,35] (10,35] (10,35] (35,350] (10,35] (35,350] (2,10]
## [401] (10,35] (1,2] (10,35] (35,350] (1,2] [0,1] (2,10] [0,1]
## [409] (2,10] (1,2]
## Levels: [0,1] (1,2] (2,10] (10,35] (35,350]
# Set the categories for interval_type to "(lo, hi]" and "[lo, hi)"
# Bin a numeric vector into n quantile-based categories.
#   x             numeric vector to bin
#   n             number of bins (n + 1 quantile break points are used)
#   na.rm         passed on to quantile(); drop NAs before computing breaks?
#   labels        passed on to cut(); NULL keeps the "(a,b]"-style labels
#   interval_type whether intervals are right-closed "(lo, hi]" or
#                 left-closed "[lo, hi)"
# Returns a factor of the same length as x.
cut_by_quantile <- function(x, n = 5, na.rm = FALSE, labels = NULL,
                            interval_type = c("(lo, hi]", "[lo, hi)")) {
  # Validate interval_type against the choices listed in the default
  interval_type <- match.arg(interval_type)
  # Evenly spaced probabilities give n + 1 sample-quantile break points
  break_points <- quantile(x, seq(0, 1, length.out = n + 1),
                           na.rm = na.rm, names = FALSE)
  # "(lo, hi]" means the intervals are closed on the right
  closed_on_right <- identical(interval_type, "(lo, hi]")
  cut(x, break_points, labels = labels, right = closed_on_right,
      include.lowest = TRUE)
}
# Remove the interval_type argument from the call
cut_by_quantile(n_visits)
## [1] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1]
## [9] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1]
## [17] [0,1] [0,1] [0,1] (10,35] (35,350] (10,35] [0,1] (2,10]
## [25] (1,2] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1] (35,350]
## [33] (35,350] (35,350] (35,350] (35,350] (10,35] (35,350] (35,350] (35,350]
## [41] (35,350] (35,350] (2,10] (35,350] (35,350] (35,350] (2,10] (2,10]
## [49] (1,2] (10,35] (10,35] (10,35] (35,350] (35,350] (10,35] (10,35]
## [57] (35,350] (2,10] (35,350] (10,35] (2,10] (10,35] (35,350] (35,350]
## [65] (35,350] (35,350] (10,35] (10,35] (35,350] (10,35] [0,1] (35,350]
## [73] (10,35] (10,35] (2,10] (10,35] (10,35] (10,35] (2,10] (35,350]
## [81] (35,350] (35,350] (10,35] (10,35] (10,35] [0,1] (35,350] (2,10]
## [89] (10,35] (35,350] (10,35] (10,35] (35,350] (10,35] [0,1] [0,1]
## [97] (2,10] [0,1] (2,10] (2,10] (35,350] (2,10] (2,10] (2,10]
## [105] (10,35] (1,2] (10,35] (35,350] (2,10] (35,350] (2,10] (35,350]
## [113] (1,2] (35,350] (1,2] (35,350] (10,35] [0,1] [0,1] [0,1]
## [121] [0,1] (1,2] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1]
## [129] (2,10] [0,1] [0,1] (1,2] (1,2] [0,1] [0,1] (1,2]
## [137] [0,1] [0,1] [0,1] (1,2] (1,2] (2,10] (2,10] (2,10]
## [145] (2,10] (2,10] [0,1] [0,1] (1,2] [0,1] (1,2] (2,10]
## [153] [0,1] [0,1] [0,1] [0,1] (35,350] (10,35] (35,350] (10,35]
## [161] (2,10] (35,350] (2,10] [0,1] (10,35] (2,10] (10,35] (10,35]
## [169] (35,350] (10,35] (10,35] (35,350] (35,350] (10,35] (35,350] (10,35]
## [177] (10,35] (2,10] (35,350] (10,35] (10,35] (10,35] (35,350] (35,350]
## [185] [0,1] (10,35] (35,350] (10,35] (10,35] (2,10] (35,350] (10,35]
## [193] (35,350] (10,35] (35,350] (10,35] (35,350] (10,35] (35,350] [0,1]
## [201] (10,35] (35,350] (35,350] [0,1] [0,1] (2,10] (35,350] (2,10]
## [209] (1,2] (2,10] (10,35] (2,10] [0,1] (2,10] (35,350] (10,35]
## [217] (35,350] (10,35] (35,350] (10,35] (1,2] (10,35] (2,10] (35,350]
## [225] (2,10] (10,35] (10,35] (10,35] [0,1] (10,35] (10,35] (10,35]
## [233] (35,350] (10,35] (2,10] (2,10] [0,1] [0,1] [0,1] [0,1]
## [241] (2,10] (1,2] [0,1] [0,1] [0,1] (2,10] (10,35] [0,1]
## [249] [0,1] (2,10] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1]
## [257] [0,1] [0,1] [0,1] [0,1] (35,350] (2,10] [0,1] (35,350]
## [265] (2,10] (35,350] (2,10] (1,2] (35,350] (2,10] (2,10] (2,10]
## [273] (1,2] (10,35] (2,10] (10,35] (35,350] (2,10] (35,350] (35,350]
## [281] (2,10] (2,10] (35,350] (10,35] (2,10] (35,350] (10,35] (2,10]
## [289] (1,2] [0,1] (2,10] [0,1] [0,1] [0,1] [0,1] [0,1]
## [297] (1,2] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1]
## [305] [0,1] (1,2] [0,1] [0,1] [0,1] [0,1] (1,2] [0,1]
## [313] [0,1] (1,2] [0,1] [0,1] [0,1] [0,1] (1,2] [0,1]
## [321] [0,1] [0,1] [0,1] [0,1] (1,2] [0,1] [0,1] [0,1]
## [329] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1]
## [337] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1]
## [345] [0,1] (2,10] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1]
## [353] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1] (1,2] [0,1]
## [361] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1]
## [369] [0,1] [0,1] (1,2] [0,1] [0,1] [0,1] (35,350] (10,35]
## [377] (35,350] (35,350] (35,350] (10,35] (35,350] (35,350] (35,350] (35,350]
## [385] (2,10] (2,10] (2,10] (10,35] (35,350] (10,35] (10,35] (10,35]
## [393] (10,35] (10,35] (10,35] (10,35] (35,350] (10,35] (35,350] (2,10]
## [401] (10,35] (1,2] (10,35] (35,350] (1,2] [0,1] (2,10] [0,1]
## [409] (2,10] (1,2]
## Levels: [0,1] (1,2] (2,10] (10,35] (35,350]
std_and_poor500 <- readRDS("./RInputFiles/std_and_poor500_with_pe_2019-06-21.rds")
glimpse(std_and_poor500)
## Observations: 505
## Variables: 5
## $ symbol <chr> "MMM", "ABT", "ABBV", "ABMD", "ACN", "ATVI", "ADBE", "AMD"...
## $ company <chr> "3M Company", "Abbott Laboratories", "AbbVie Inc.", "ABIOM...
## $ sector <chr> "Industrials", "Health Care", "Health Care", "Health Care"...
## $ industry <chr> "Industrial Conglomerates", "Health Care Equipment", "Phar...
## $ pe_ratio <dbl> 18.32, 57.67, 22.44, 45.64, 27.00, 20.14, 55.94, 116.87, 2...
# From previous steps
# Return the multiplicative inverse of x (vectorized; 1 / x).
get_reciprocal <- function(x) {
  recip <- 1 / x
  recip
}
# Harmonic mean of x: the reciprocal of the arithmetic mean of reciprocals.
# NAs are always dropped before averaging (na.rm is hard-coded in this
# first version of the function).
calc_harmonic_mean <- function(x) {
  mean_of_reciprocals <- mean(1 / x, na.rm = TRUE)
  1 / mean_of_reciprocals
}
std_and_poor500 %>%
# Group by sector
group_by(sector) %>%
# Summarize, calculating harmonic mean of P/E ratio
summarize(hmean_pe_ratio = calc_harmonic_mean(pe_ratio))
## # A tibble: 11 x 2
## sector hmean_pe_ratio
## <chr> <dbl>
## 1 Communication Services 17.5
## 2 Consumer Discretionary 15.2
## 3 Consumer Staples 19.8
## 4 Energy 13.7
## 5 Financials 12.9
## 6 Health Care 26.6
## 7 Industrials 18.2
## 8 Information Technology 21.6
## 9 Materials 16.3
## 10 Real Estate 32.5
## 11 Utilities 23.9
# From previous step
# Harmonic mean of x, with NA handling exposed to the caller.
#   na.rm  forwarded to mean(); FALSE (the default) propagates NAs.
calc_harmonic_mean <- function(x, na.rm = FALSE) {
  mean_of_reciprocals <- mean(1 / x, na.rm = na.rm)
  1 / mean_of_reciprocals
}
std_and_poor500 %>%
# Group by sector
group_by(sector) %>%
# Summarize, calculating harmonic mean of P/E ratio
summarize(hmean_pe_ratio = calc_harmonic_mean(pe_ratio, na.rm=TRUE))
## # A tibble: 11 x 2
## sector hmean_pe_ratio
## <chr> <dbl>
## 1 Communication Services 17.5
## 2 Consumer Discretionary 15.2
## 3 Consumer Staples 19.8
## 4 Energy 13.7
## 5 Financials 12.9
## 6 Health Care 26.6
## 7 Industrials 18.2
## 8 Information Technology 21.6
## 9 Materials 16.3
## 10 Real Estate 32.5
## 11 Utilities 23.9
# Harmonic mean of x. Any extra arguments (e.g. na.rm, trim) are
# forwarded straight through to mean() via `...`.
calc_harmonic_mean <- function(x, ...) {
  mean_of_reciprocals <- mean(1 / x, ...)
  1 / mean_of_reciprocals
}
std_and_poor500 %>%
# Group by sector
group_by(sector) %>%
# Summarize, calculating harmonic mean of P/E ratio
summarize(hmean_pe_ratio=calc_harmonic_mean(pe_ratio, na.rm=TRUE))
## # A tibble: 11 x 2
## sector hmean_pe_ratio
## <chr> <dbl>
## 1 Communication Services 17.5
## 2 Consumer Discretionary 15.2
## 3 Consumer Staples 19.8
## 4 Energy 13.7
## 5 Financials 12.9
## 6 Health Care 26.6
## 7 Industrials 18.2
## 8 Information Technology 21.6
## 9 Materials 16.3
## 10 Real Estate 32.5
## 11 Utilities 23.9
# Harmonic mean of x with input validation.
# Fails fast with an informative error when x is not numeric.
calc_harmonic_mean <- function(x, na.rm = FALSE) {
  # Assert that x is numeric
  assertive.types::assert_is_numeric(x)
  mean_of_reciprocals <- mean(1 / x, na.rm = na.rm)
  1 / mean_of_reciprocals
}
# See what happens when you pass it strings (bombs out, as it should)
# calc_harmonic_mean(std_and_poor500$sector)
# Harmonic mean of x, validating both the type and the sign of the input.
# The harmonic mean is undefined for zero or negative values, so those
# are rejected with an error rather than silently producing nonsense.
calc_harmonic_mean <- function(x, na.rm = FALSE) {
  assertive.types::assert_is_numeric(x)
  # Check if any values of x are non-positive
  has_non_positive <- any(assertive.numbers::is_non_positive(x), na.rm = TRUE)
  if (has_non_positive) {
    # Throw an error
    stop("x contains non-positive values, so the harmonic mean makes no sense.")
  }
  mean_of_reciprocals <- mean(1 / x, na.rm = na.rm)
  1 / mean_of_reciprocals
}
# See what happens when you pass it negative numbers (bombs out as it should)
# calc_harmonic_mean(std_and_poor500$pe_ratio - 20)
# Update the function definition to fix the na.rm argument
# Harmonic mean of x with full input sanitization:
#  - x must be numeric and strictly positive;
#  - na.rm is reduced to its first element and coerced to logical
#    (with warnings, not errors, on malformed input).
calc_harmonic_mean <- function(x, na.rm = FALSE) {
  assertive.types::assert_is_numeric(x)
  if (any(assertive.numbers::is_non_positive(x), na.rm = TRUE)) {
    stop("x contains non-positive values, so the harmonic mean makes no sense.")
  }
  # Use the first value of na.rm, and coerce to logical
  na.rm <- assertive.base::coerce_to(assertive.base::use_first(na.rm), "logical")
  mean_of_reciprocals <- mean(1 / x, na.rm = na.rm)
  1 / mean_of_reciprocals
}
# See what happens when you pass it malformed na.rm
calc_harmonic_mean(std_and_poor500$pe_ratio, na.rm = 1:5)
## Warning: Only the first value of na.rm (= 1) will be used.
## Warning: Coercing assertive.base::use_first(na.rm) to class 'logical'.
## [1] 18.2
Chapter 3 - Return Values and Scope
Returning Values from Functions:
Returning Multiple Values from Functions:
Environments:
Scope and Precedence:
Example code includes:
# Determine whether a year is a leap year under the Gregorian rules:
# divisible by 400 -> leap; else divisible by 100 -> not leap;
# else divisible by 4 -> leap; otherwise not leap.
#
# Generalized from the original scalar version: `year` may now be a
# numeric vector, and a logical vector of the same length is returned.
# (The original errored/misbehaved on vector input because `if` takes a
# scalar condition; scalar behavior is unchanged.)
is_leap_year <- function(year) {
  # Vectorized boolean form of the three divisibility rules above
  (year %% 400 == 0) | (year %% 4 == 0 & year %% 100 != 0)
}
# Recreate the example dataset explicitly so the chunk is self-contained.
# The values appear to match R's built-in `datasets::cars`
# (speed/stopping-distance data) -- TODO confirm against datasets::cars.
cars <- data.frame(speed=c(4, 4, 7, 7, 8, 9, 10, 10, 10, 11, 11, 12, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 16, 16, 17, 17, 17, 18, 18, 18, 18, 19, 19, 19, 20, 20, 20, 20, 20, 22, 23, 24, 24, 24, 24, 25),
dist=c(2, 10, 4, 22, 16, 10, 18, 26, 34, 17, 28, 14, 20, 24, 28, 26, 34, 34, 46, 26, 36, 60, 80, 20, 26, 54, 32, 40, 32, 40, 50, 42, 56, 76, 84, 36, 46, 68, 32, 48, 52, 56, 64, 66, 54, 70, 92, 93, 120, 85)
)
str(cars)
## 'data.frame': 50 obs. of 2 variables:
## $ speed: num 4 4 7 7 8 9 10 10 10 11 ...
## $ dist : num 2 10 4 22 16 10 18 26 34 17 ...
# Using cars, draw a scatter plot of dist vs. speed
plt_dist_vs_speed <- plot(dist ~ speed, data = cars)
# Oh no! The plot object is NULL
plt_dist_vs_speed
## NULL
# Define a scatter plot fn with data and formula args
# Draws plot(formula, data) for its side effect, then invisibly returns
# the input data so the function can sit in the middle of a pipe chain.
pipeable_plot <- function(data, formula) {
  # Side effect: draw the scatter plot via the formula interface
  plot(formula, data)
  # Hand the (unmodified) data back invisibly for downstream pipe steps
  invisible(data)
}
# Draw the scatter plot of dist vs. speed again
plt_dist_vs_speed <- cars %>%
pipeable_plot(dist ~ speed)
# Now the plot object has a value
plt_dist_vs_speed
## speed dist
## 1 4 2
## 2 4 10
## 3 7 4
## 4 7 22
## 5 8 16
## 6 9 10
## 7 10 18
## 8 10 26
## 9 10 34
## 10 11 17
## 11 11 28
## 12 12 14
## 13 12 20
## 14 12 24
## 15 12 28
## 16 13 26
## 17 13 34
## 18 13 34
## 19 13 46
## 20 14 26
## 21 14 36
## 22 14 60
## 23 14 80
## 24 15 20
## 25 15 26
## 26 15 54
## 27 16 32
## 28 16 40
## 29 17 32
## 30 17 40
## 31 17 50
## 32 18 42
## 33 18 56
## 34 18 76
## 35 18 84
## 36 19 36
## 37 19 46
## 38 19 68
## 39 20 32
## 40 20 48
## 41 20 52
## 42 20 56
## 43 20 64
## 44 22 66
## 45 23 54
## 46 24 70
## 47 24 92
## 48 24 93
## 49 24 120
## 50 25 85
# Look at the structure of model (it's a mess!)
str(model)
## List of 31
## $ coefficients : Named num [1:7] 4.0864 0.374 -0.0199 -0.5807 -0.5782 ...
## ..- attr(*, "names")= chr [1:7] "(Intercept)" "genderfemale" "income($25k,$55k]" "income($55k,$95k]" ...
## $ residuals : Named num [1:346] -0.535 -0.768 -0.944 -0.662 -0.767 ...
## ..- attr(*, "names")= chr [1:346] "25" "26" "27" "29" ...
## $ fitted.values : Named num [1:346] 4.3 4.3 17.83 2.96 4.29 ...
## ..- attr(*, "names")= chr [1:346] "25" "26" "27" "29" ...
## $ effects : Named num [1:346] -360 -29.2 20.3 -10 23.4 ...
## ..- attr(*, "names")= chr [1:346] "(Intercept)" "genderfemale" "income($25k,$55k]" "income($55k,$95k]" ...
## $ R : num [1:7, 1:7] -97.4 0 0 0 0 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:7] "(Intercept)" "genderfemale" "income($25k,$55k]" "income($55k,$95k]" ...
## .. ..$ : chr [1:7] "(Intercept)" "genderfemale" "income($25k,$55k]" "income($55k,$95k]" ...
## $ rank : int 7
## $ qr :List of 5
## ..$ qr : num [1:346, 1:7] -97.3861 0.0213 0.0434 0.0177 0.0213 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : chr [1:346] "25" "26" "27" "29" ...
## .. .. ..$ : chr [1:7] "(Intercept)" "genderfemale" "income($25k,$55k]" "income($55k,$95k]" ...
## ..$ rank : int 7
## ..$ qraux: num [1:7] 1.02 1.02 1.04 1.01 1 ...
## ..$ pivot: int [1:7] 1 2 3 4 5 6 7
## ..$ tol : num 1e-11
## ..- attr(*, "class")= chr "qr"
## $ family :List of 12
## ..$ family : chr "poisson"
## ..$ link : chr "log"
## ..$ linkfun :function (mu)
## ..$ linkinv :function (eta)
## ..$ variance :function (mu)
## ..$ dev.resids:function (y, mu, wt)
## ..$ aic :function (y, n, mu, wt, dev)
## ..$ mu.eta :function (eta)
## ..$ initialize: expression({ if (any(y < 0)) stop("negative values not allowed for the 'Poisson' family") n <- rep.int(1, nobs| __truncated__
## ..$ validmu :function (mu)
## ..$ valideta :function (eta)
## ..$ simulate :function (object, nsim)
## ..- attr(*, "class")= chr "family"
## $ linear.predictors: Named num [1:346] 1.46 1.46 2.88 1.09 1.46 ...
## ..- attr(*, "names")= chr [1:346] "25" "26" "27" "29" ...
## $ deviance : num 11529
## $ aic : num 12864
## $ null.deviance : num 18850
## $ iter : int 6
## $ weights : Named num [1:346] 4.3 4.3 17.83 2.96 4.29 ...
## ..- attr(*, "names")= chr [1:346] "25" "26" "27" "29" ...
## $ prior.weights : Named num [1:346] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "names")= chr [1:346] "25" "26" "27" "29" ...
## $ df.residual : int 339
## $ df.null : int 345
## $ y : Named num [1:346] 2 1 1 1 1 1 80 104 55 350 ...
## ..- attr(*, "names")= chr [1:346] "25" "26" "27" "29" ...
## $ converged : logi TRUE
## $ boundary : logi FALSE
## $ model :'data.frame': 346 obs. of 4 variables:
## ..$ n_visits: num [1:346] 2 1 1 1 1 1 80 104 55 350 ...
## ..$ gender : Factor w/ 2 levels "male","female": 2 2 1 1 2 1 2 2 1 2 ...
## ..$ income : Factor w/ 4 levels "[$0,$25k]","($25k,$55k]",..: 4 4 4 4 3 1 1 4 2 2 ...
## ..$ travel : Factor w/ 3 levels "[0h,0.25h]","(0.25h,4h]",..: 3 3 2 3 3 1 1 1 2 1 ...
## ..- attr(*, "terms")=Classes 'terms', 'formula' language n_visits ~ gender + income + travel
## .. .. ..- attr(*, "variables")= language list(n_visits, gender, income, travel)
## .. .. ..- attr(*, "factors")= int [1:4, 1:3] 0 1 0 0 0 0 1 0 0 0 ...
## .. .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. .. ..$ : chr [1:4] "n_visits" "gender" "income" "travel"
## .. .. .. .. ..$ : chr [1:3] "gender" "income" "travel"
## .. .. ..- attr(*, "term.labels")= chr [1:3] "gender" "income" "travel"
## .. .. ..- attr(*, "order")= int [1:3] 1 1 1
## .. .. ..- attr(*, "intercept")= int 1
## .. .. ..- attr(*, "response")= int 1
## .. .. ..- attr(*, ".Environment")=<environment: 0x000000007b919638>
## .. .. ..- attr(*, "predvars")= language list(n_visits, gender, income, travel)
## .. .. ..- attr(*, "dataClasses")= Named chr [1:4] "numeric" "factor" "factor" "factor"
## .. .. .. ..- attr(*, "names")= chr [1:4] "n_visits" "gender" "income" "travel"
## ..- attr(*, "na.action")= 'omit' Named int [1:64] 1 2 3 4 5 6 7 8 9 10 ...
## .. ..- attr(*, "names")= chr [1:64] "1" "2" "3" "4" ...
## $ na.action : 'omit' Named int [1:64] 1 2 3 4 5 6 7 8 9 10 ...
## ..- attr(*, "names")= chr [1:64] "1" "2" "3" "4" ...
## $ call : language glm(formula = formula, family = poisson, data = data)
## $ formula :Class 'formula' language n_visits ~ gender + income + travel
## .. ..- attr(*, ".Environment")=<environment: 0x000000007b919638>
## $ terms :Classes 'terms', 'formula' language n_visits ~ gender + income + travel
## .. ..- attr(*, "variables")= language list(n_visits, gender, income, travel)
## .. ..- attr(*, "factors")= int [1:4, 1:3] 0 1 0 0 0 0 1 0 0 0 ...
## .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. ..$ : chr [1:4] "n_visits" "gender" "income" "travel"
## .. .. .. ..$ : chr [1:3] "gender" "income" "travel"
## .. ..- attr(*, "term.labels")= chr [1:3] "gender" "income" "travel"
## .. ..- attr(*, "order")= int [1:3] 1 1 1
## .. ..- attr(*, "intercept")= int 1
## .. ..- attr(*, "response")= int 1
## .. ..- attr(*, ".Environment")=<environment: 0x000000007b919638>
## .. ..- attr(*, "predvars")= language list(n_visits, gender, income, travel)
## .. ..- attr(*, "dataClasses")= Named chr [1:4] "numeric" "factor" "factor" "factor"
## .. .. ..- attr(*, "names")= chr [1:4] "n_visits" "gender" "income" "travel"
## $ data :'data.frame': 410 obs. of 4 variables:
## ..$ n_visits: num [1:410] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ gender : Factor w/ 2 levels "male","female": 1 1 1 2 1 2 2 2 1 1 ...
## ..$ income : Factor w/ 4 levels "[$0,$25k]","($25k,$55k]",..: 4 2 4 2 4 2 4 4 4 4 ...
## ..$ travel : Factor w/ 3 levels "[0h,0.25h]","(0.25h,4h]",..: NA NA NA NA NA NA NA NA NA NA ...
## $ offset : NULL
## $ control :List of 3
## ..$ epsilon: num 1e-08
## ..$ maxit : num 25
## ..$ trace : logi FALSE
## $ method : chr "glm.fit"
## $ contrasts :List of 3
## ..$ gender: chr "contr.treatment"
## ..$ income: chr "contr.treatment"
## ..$ travel: chr "contr.treatment"
## $ xlevels :List of 3
## ..$ gender: chr [1:2] "male" "female"
## ..$ income: chr [1:4] "[$0,$25k]" "($25k,$55k]" "($55k,$95k]" "($95k,$Inf)"
## ..$ travel: chr [1:3] "[0h,0.25h]" "(0.25h,4h]" "(4h,Infh)"
## - attr(*, "class")= chr [1:2] "glm" "lm"
# Use broom tools to get a list of 3 data frames
list(
# Get model-level values
model = broom::glance(model),
# Get coefficient-level values
coefficients = broom::tidy(model),
# Get observation-level values
observations = broom::augment(model)
)
## $model
## # A tibble: 1 x 7
## null.deviance df.null logLik AIC BIC deviance df.residual
## <dbl> <int> <dbl> <dbl> <dbl> <dbl> <int>
## 1 18850. 345 -6425. 12864. 12891. 11529. 339
##
## $coefficients
## # A tibble: 7 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 4.09 0.0279 146. 0.
## 2 genderfemale 0.374 0.0212 17.6 2.18e- 69
## 3 income($25k,$55k] -0.0199 0.0267 -0.746 4.56e- 1
## 4 income($55k,$95k] -0.581 0.0343 -16.9 3.28e- 64
## 5 income($95k,$Inf) -0.578 0.0310 -18.7 6.88e- 78
## 6 travel(0.25h,4h] -0.627 0.0217 -28.8 5.40e-183
## 7 travel(4h,Infh) -2.42 0.0492 -49.3 0.
##
## $observations
## # A tibble: 346 x 12
## .rownames n_visits gender income travel .fitted .se.fit .resid .hat
## <chr> <dbl> <fct> <fct> <fct> <dbl> <dbl> <dbl> <dbl>
## 1 25 2 female ($95k~ (4h,I~ 1.46 0.0502 -1.24 0.0109
## 2 26 1 female ($95k~ (4h,I~ 1.46 0.0502 -1.92 0.0109
## 3 27 1 male ($95k~ (0.25~ 2.88 0.0269 -5.28 0.0129
## 4 29 1 male ($95k~ (4h,I~ 1.09 0.0490 -1.32 0.00711
## 5 30 1 female ($55k~ (4h,I~ 1.46 0.0531 -1.92 0.0121
## 6 31 1 male [$0,$~ [0h,0~ 4.09 0.0279 -10.4 0.0465
## 7 33 80 female [$0,$~ [0h,0~ 4.46 0.0235 -0.710 0.0479
## 8 34 104 female ($95k~ [0h,0~ 3.88 0.0261 6.90 0.0332
## 9 35 55 male ($25k~ (0.25~ 3.44 0.0222 3.85 0.0153
## 10 36 350 female ($25k~ [0h,0~ 4.44 0.0206 21.5 0.0360
## # ... with 336 more rows, and 3 more variables: .sigma <dbl>, .cooksd <dbl>,
## # .std.resid <dbl>
# From previous step
# Collect the three broom summaries of a fitted model into one list:
# model-level stats, per-coefficient stats, and per-observation stats.
groom_model <- function(model) {
  model_level <- broom::glance(model)        # one-row model summary
  coefficient_level <- broom::tidy(model)    # one row per term
  observation_level <- broom::augment(model) # one row per observation
  list(
    model = model_level,
    coefficients = coefficient_level,
    observations = observation_level
  )
}
library(zeallot) # needed for %<-%
##
## Attaching package: 'zeallot'
## The following objects are masked from 'package:igraph':
##
## %->%, %<-%
# Call groom_model on model, assigning to 3 variables
c(mdl, cff, obs) %<-% groom_model(model)
# See these individual variables
mdl; cff; obs
## # A tibble: 1 x 7
## null.deviance df.null logLik AIC BIC deviance df.residual
## <dbl> <int> <dbl> <dbl> <dbl> <dbl> <int>
## 1 18850. 345 -6425. 12864. 12891. 11529. 339
## # A tibble: 7 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 4.09 0.0279 146. 0.
## 2 genderfemale 0.374 0.0212 17.6 2.18e- 69
## 3 income($25k,$55k] -0.0199 0.0267 -0.746 4.56e- 1
## 4 income($55k,$95k] -0.581 0.0343 -16.9 3.28e- 64
## 5 income($95k,$Inf) -0.578 0.0310 -18.7 6.88e- 78
## 6 travel(0.25h,4h] -0.627 0.0217 -28.8 5.40e-183
## 7 travel(4h,Infh) -2.42 0.0492 -49.3 0.
## # A tibble: 346 x 12
## .rownames n_visits gender income travel .fitted .se.fit .resid .hat
## <chr> <dbl> <fct> <fct> <fct> <dbl> <dbl> <dbl> <dbl>
## 1 25 2 female ($95k~ (4h,I~ 1.46 0.0502 -1.24 0.0109
## 2 26 1 female ($95k~ (4h,I~ 1.46 0.0502 -1.92 0.0109
## 3 27 1 male ($95k~ (0.25~ 2.88 0.0269 -5.28 0.0129
## 4 29 1 male ($95k~ (4h,I~ 1.09 0.0490 -1.32 0.00711
## 5 30 1 female ($55k~ (4h,I~ 1.46 0.0531 -1.92 0.0121
## 6 31 1 male [$0,$~ [0h,0~ 4.09 0.0279 -10.4 0.0465
## 7 33 80 female [$0,$~ [0h,0~ 4.46 0.0235 -0.710 0.0479
## 8 34 104 female ($95k~ [0h,0~ 3.88 0.0261 6.90 0.0332
## 9 35 55 male ($25k~ (0.25~ 3.44 0.0222 3.85 0.0153
## 10 36 350 female ($25k~ [0h,0~ 4.44 0.0206 21.5 0.0360
## # ... with 336 more rows, and 3 more variables: .sigma <dbl>, .cooksd <dbl>,
## # .std.resid <dbl>
# Pipe-friendly scatter plot that also records which formula was plotted.
# Draws plot(formula, data), then invisibly returns the data with the
# formula attached as a "formula" attribute.
pipeable_plot <- function(data, formula) {
  plot(formula, data)
  # Stash the formula on the returned data for later inspection
  annotated <- structure(data, formula = formula)
  invisible(annotated)
}
# From previous exercise
plt_dist_vs_speed <- cars %>%
pipeable_plot(dist ~ speed)
# Examine the structure of the result
str(plt_dist_vs_speed)
## 'data.frame': 50 obs. of 2 variables:
## $ speed: num 4 4 7 7 8 9 10 10 10 11 ...
## $ dist : num 2 10 4 22 16 10 18 26 34 17 ...
## - attr(*, "formula")=Class 'formula' language dist ~ speed
## .. ..- attr(*, ".Environment")=<environment: 0x000000005e7c6518>
capitals <- tibble::tibble(city=c("Cape Town", "Bloemfontein", "Pretoria"),
type_of_capital=c("Legislative", "Judicial", "Administrative")
)
national_parks <- c('Addo Elephant National Park', 'Agulhas National Park', 'Ai-Ais/Richtersveld Transfrontier Park', 'Augrabies Falls National Park', 'Bontebok National Park', 'Camdeboo National Park', 'Golden Gate Highlands National Park', 'Hluhluwe–Imfolozi Park', 'Karoo National Park', 'Kgalagadi Transfrontier Park', 'Knysna National Lake Area', 'Kruger National Park', 'Mapungubwe National Park', 'Marakele National Park', 'Mokala National Park', 'Mountain Zebra National Park', 'Namaqua National Park', 'Table Mountain National Park', 'Tankwa Karoo National Park', 'Tsitsikamma National Park', 'West Coast National Park', 'Wilderness National Park')
population <- ts(c(40583573, 44819778, 47390900, 51770560, 55908900), start=1996, end=2016, deltat=5)
capitals
## # A tibble: 3 x 2
## city type_of_capital
## <chr> <chr>
## 1 Cape Town Legislative
## 2 Bloemfontein Judicial
## 3 Pretoria Administrative
national_parks
## [1] "Addo Elephant National Park"
## [2] "Agulhas National Park"
## [3] "Ai-Ais/Richtersveld Transfrontier Park"
## [4] "Augrabies Falls National Park"
## [5] "Bontebok National Park"
## [6] "Camdeboo National Park"
## [7] "Golden Gate Highlands National Park"
## [8] "Hluhluwe–Imfolozi Park"
## [9] "Karoo National Park"
## [10] "Kgalagadi Transfrontier Park"
## [11] "Knysna National Lake Area"
## [12] "Kruger National Park"
## [13] "Mapungubwe National Park"
## [14] "Marakele National Park"
## [15] "Mokala National Park"
## [16] "Mountain Zebra National Park"
## [17] "Namaqua National Park"
## [18] "Table Mountain National Park"
## [19] "Tankwa Karoo National Park"
## [20] "Tsitsikamma National Park"
## [21] "West Coast National Park"
## [22] "Wilderness National Park"
population
## Time Series:
## Start = 1996
## End = 2016
## Frequency = 0.2
## [1] 40583573 44819778 47390900 51770560 55908900
# From previous steps
rsa_lst <- list(
capitals = capitals,
national_parks = national_parks,
population = population
)
rsa_env <- list2env(rsa_lst)
ls.str(rsa_lst)
## capitals : Classes 'tbl_df', 'tbl' and 'data.frame': 3 obs. of 2 variables:
## $ city : chr "Cape Town" "Bloemfontein" "Pretoria"
## $ type_of_capital: chr "Legislative" "Judicial" "Administrative"
## national_parks : chr [1:22] "Addo Elephant National Park" "Agulhas National Park" ...
## population : Time-Series [1:5] from 1996 to 2016: 40583573 44819778 47390900 51770560 55908900
ls.str(rsa_env)
## capitals : Classes 'tbl_df', 'tbl' and 'data.frame': 3 obs. of 2 variables:
## $ city : chr "Cape Town" "Bloemfontein" "Pretoria"
## $ type_of_capital: chr "Legislative" "Judicial" "Administrative"
## national_parks : chr [1:22] "Addo Elephant National Park" "Agulhas National Park" ...
## population : Time-Series [1:5] from 1996 to 2016: 40583573 44819778 47390900 51770560 55908900
# Find the parent environment of rsa_env
parent <- parent.env(rsa_env)
# Print its name
environmentName(parent)
## [1] "R_GlobalEnv"
# Compare the contents of the global environment and rsa_env
# ls.str(globalenv())
ls.str(rsa_env)
## capitals : Classes 'tbl_df', 'tbl' and 'data.frame': 3 obs. of 2 variables:
## $ city : chr "Cape Town" "Bloemfontein" "Pretoria"
## $ type_of_capital: chr "Legislative" "Judicial" "Administrative"
## national_parks : chr [1:22] "Addo Elephant National Park" "Agulhas National Park" ...
## population : Time-Series [1:5] from 1996 to 2016: 40583573 44819778 47390900 51770560 55908900
# Does population exist in rsa_env?
exists("population", envir = rsa_env)
## [1] TRUE
# Does population exist in rsa_env, ignoring inheritance?
exists("population", envir = rsa_env, inherits=FALSE)
## [1] TRUE
Chapter 4 - Case Study on Grain Yields
Grain Yields and Conversion:
Visualizing Grain Yields:
Modeling Grain Yields:
Wrap Up:
Example code includes:
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
corn <- readRDS("./RInputFiles/nass.corn.rds")
wheat <- readRDS("./RInputFiles/nass.wheat.rds")
barley <- readRDS("./RInputFiles/nass.barley.rds")
corn <- as_tibble(corn)
wheat <- as_tibble(wheat)
barley <- as_tibble(barley)
str(corn)
## Classes 'tbl_df', 'tbl' and 'data.frame': 6381 obs. of 4 variables:
## $ year : int 1866 1866 1866 1866 1866 1866 1866 1866 1866 1866 ...
## $ state : chr "Alabama" "Arkansas" "California" "Connecticut" ...
## $ farmed_area_acres : num 1050000 280000 42000 57000 200000 ...
## $ yield_bushels_per_acre: num 9 18 28 34 23 9 6 29 36.5 32 ...
str(wheat)
## Classes 'tbl_df', 'tbl' and 'data.frame': 5963 obs. of 4 variables:
## $ year : int 1866 1866 1866 1866 1866 1866 1866 1866 1866 1866 ...
## $ state : chr "Alabama" "Arkansas" "California" "Connecticut" ...
## $ farmed_area_acres : num 125000 50000 650000 2000 59000 245000 2300000 1550000 1190000 68000 ...
## $ yield_bushels_per_acre: num 5 6.5 18 17.5 11 4 10.5 10 13 19 ...
str(barley)
## Classes 'tbl_df', 'tbl' and 'data.frame': 4839 obs. of 4 variables:
## $ year : int 1866 1866 1866 1866 1866 1866 1866 1866 1866 1866 ...
## $ state : chr "Connecticut" "Illinois" "Indiana" "Iowa" ...
## $ farmed_area_acres : num 1000 96000 11000 66000 2000 10000 34000 7000 21000 20000 ...
## $ yield_bushels_per_acre: num 22.5 23.4 23 22 23 23.5 21.5 25.5 26 26 ...
# Write a function to convert acres to sq. yards
# One acre is exactly 4840 square yards.
acres_to_sq_yards <- function(acres) {
  sq_yards_per_acre <- 4840
  acres * sq_yards_per_acre
}
# Write a function to convert yards to meters
# 1 yard = 36 inches; 1 inch = 0.0254 m exactly.
yards_to_meters <- function(yards) {
  inches <- yards * 36
  inches * 0.0254
}
# Write a function to convert sq. meters to hectares
# One hectare is exactly 10,000 square meters.
sq_meters_to_hectares <- function(sq_meters) {
  sq_meters / 10000
}
# Write a function to convert sq. yards to sq. meters
# Convert an area by converting the side length of the equivalent square:
# take the square root, convert yards to meters, then square again.
# (The magrittr pipeline and the yards_to_meters()/raise_to_power()
# helpers are inlined; the arithmetic and its order are unchanged.)
sq_yards_to_sq_meters <- function(sq_yards) {
  side_yards <- sqrt(sq_yards)
  side_meters <- side_yards * 36 * 0.0254
  side_meters^2
}
# Write a function to convert acres to hectares
# acres -> sq yards -> sq meters (via side length) -> hectares.
# (The magrittr pipeline and helper calls are inlined; the arithmetic
# and its order are unchanged from the original.)
acres_to_hectares <- function(acres) {
  # Convert acres to sq yards (1 acre = 4840 sq yd)
  sq_yards <- acres * 4840
  # Convert sq yards to sq meters via the side length of a square
  side_meters <- sqrt(sq_yards) * 36 * 0.0254
  sq_meters <- side_meters^2
  # Convert sq meters to hectares (1 ha = 10,000 sq m)
  sq_meters / 10000
}
# Write a function to convert lb to kg
# 1 lb = 0.45359237 kg exactly (international avoirdupois pound).
lbs_to_kgs <- function(lbs) {
  kgs_per_lb <- 0.45359237
  lbs * kgs_per_lb
}
# Write a function to convert bushels to lbs
# Looks up the pounds-per-bushel scale factor for the given crop and
# multiplies. (magrittr's extract()/multiply_by() replaced by plain
# base-R subsetting and multiplication; the result keeps the crop name,
# as before.)
bushels_to_lbs <- function(bushels, crop) {
  # Lookup table of pounds per bushel by crop
  lbs_per_bushel <- c(barley = 48, corn = 56, wheat = 60, volume = 8)
  lbs_per_bushel[crop] * bushels
}
# Write a function to convert bushels to kg
# bushels -> lbs (crop-specific scale factor) -> kgs.
# (The magrittr pipeline and helper calls are inlined; the arithmetic
# and its order are unchanged, and the crop name is kept on the result.)
bushels_to_kgs <- function(bushels, crop) {
  lbs_per_bushel <- c(barley = 48, corn = 56, wheat = 60, volume = 8)
  lbs <- lbs_per_bushel[crop] * bushels
  lbs * 0.45359237
}
# Write a function to convert bushels/acre to kg/ha
# Validates the crop, converts bushels to kgs, then applies the
# acre-to-hectare conversion, exactly as the original did.
# NOTE(review): converting a per-acre rate to per-hectare would normally
# divide by hectares-per-acre; this applies acres_to_hectares() to the
# numerator as in the original course code -- confirm this is intended.
bushels_per_acre_to_kgs_per_hectare <- function(bushels_per_acre,
                                                crop = c("barley", "corn", "wheat")) {
  # Match the crop argument against the allowed choices
  crop <- match.arg(crop)
  kgs <- bushels_to_kgs(bushels_per_acre, crop)
  acres_to_hectares(kgs)
}
# View the corn dataset
glimpse(corn)
## Observations: 6,381
## Variables: 4
## $ year <int> 1866, 1866, 1866, 1866, 1866, 1866, 1866, 18...
## $ state <chr> "Alabama", "Arkansas", "California", "Connec...
## $ farmed_area_acres <dbl> 1050000, 280000, 42000, 57000, 200000, 12500...
## $ yield_bushels_per_acre <dbl> 9.0, 18.0, 28.0, 34.0, 23.0, 9.0, 6.0, 29.0,...
corn <- corn %>%
# Add some columns
mutate(
# Convert farmed area from acres to ha
farmed_area_ha = acres_to_hectares(farmed_area_acres),
# Convert yield from bushels/acre to kg/ha
yield_kg_per_ha = bushels_per_acre_to_kgs_per_hectare(yield_bushels_per_acre, crop = "corn")
)
# Wrap this code into a function
# Add metric-unit columns (farmed_area_ha, yield_kg_per_ha) alongside
# the original imperial columns. `crop` is forwarded to the bushel
# conversion so the correct pounds-per-bushel factor is used.
fortify_with_metric_units <- function(data, crop) {
  mutate(
    data,
    farmed_area_ha = acres_to_hectares(farmed_area_acres),
    yield_kg_per_ha = bushels_per_acre_to_kgs_per_hectare(yield_bushels_per_acre, crop = crop)
  )
}
# Try it on the wheat dataset
wheat <- fortify_with_metric_units(wheat, "wheat")
# Using corn, plot yield (kg/ha) vs. year
ggplot(corn, aes(x=year, y=yield_kg_per_ha)) +
# Add a line layer, grouped by state
geom_line(aes(group = state)) +
# Add a smooth trend layer
geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# Wrap this plotting code into a function
# Plot yield (kg/ha) vs. year: one line per state plus an overall
# smoothed trend layer. Returns the ggplot object.
plot_yield_vs_year <- function(data) {
  ggplot(data, aes(x = year, y = yield_kg_per_ha)) +
    geom_line(aes(group = state)) +
    geom_smooth()
}
# Test it on the wheat dataset
plot_yield_vs_year(wheat)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
usa_census_regions <- tibble::tibble(census_region=c('New England', 'New England', 'New England', 'New England', 'New England', 'New England', 'Mid-Atlantic', 'Mid-Atlantic', 'Mid-Atlantic', 'East North Central', 'East North Central', 'East North Central', 'East North Central', 'East North Central', 'West North Central', 'West North Central', 'West North Central', 'West North Central', 'West North Central', 'West North Central', 'West North Central', 'South Atlantic', 'South Atlantic', 'South Atlantic', 'South Atlantic', 'South Atlantic', 'South Atlantic', 'South Atlantic', 'South Atlantic', 'South Atlantic', 'East South Central', 'East South Central', 'East South Central', 'East South Central', 'West South Central', 'West South Central', 'West South Central', 'West South Central', 'Mountain', 'Mountain', 'Mountain', 'Mountain', 'Mountain', 'Mountain', 'Mountain', 'Mountain', 'Pacific', 'Pacific', 'Pacific', 'Pacific', 'Pacific'),
state=c('Connecticut', 'Maine', 'Massachusetts', 'New Hampshire', 'Rhode Island', 'Vermont', 'New Jersey', 'New York', 'Pennsylvania', 'Illinois', 'Indiana', 'Michigan', 'Ohio', 'Wisconsin', 'Iowa', 'Kansas', 'Minnesota', 'Missouri', 'Nebraska', 'North Dakota', 'South Dakota', 'Delaware', 'Florida', 'Georgia', 'Maryland', 'North Carolina', 'South Carolina', 'Virginia', 'District of Columbia', 'West Virginia', 'Alabama', 'Kentucky', 'Mississippi', 'Tennessee', 'Arkansas', 'Louisiana', 'Oklahoma', 'Texas', 'Arizona', 'Colorado', 'Idaho', 'Montana', 'Nevada', 'New Mexico', 'Utah', 'Wyoming', 'Alaska', 'California', 'Hawaii', 'Oregon', 'Washington')
)
usa_census_regions
## # A tibble: 51 x 2
## census_region state
## <chr> <chr>
## 1 New England Connecticut
## 2 New England Maine
## 3 New England Massachusetts
## 4 New England New Hampshire
## 5 New England Rhode Island
## 6 New England Vermont
## 7 Mid-Atlantic New Jersey
## 8 Mid-Atlantic New York
## 9 Mid-Atlantic Pennsylvania
## 10 East North Central Illinois
## # ... with 41 more rows
# Inner join the corn dataset to usa_census_regions by state
corn <- corn %>%
inner_join(usa_census_regions, by = "state")
# Wrap this code into a function
#
# Adds a census_region column to `data` by inner-joining the
# file-level usa_census_regions lookup table on "state".
# Note: rows whose state has no match in the lookup are dropped
# (inner-join semantics), exactly as in the original.
fortify_with_census_region <- function(data) {
  inner_join(data, usa_census_regions, by = "state")
}
# Try it on the wheat dataset
wheat <- fortify_with_census_region(wheat)
# Plot yield vs. year for the corn dataset
plot_yield_vs_year(corn) +
facet_wrap(~census_region)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# Wrap this code into a function
#
# Builds the yield-vs-year plot and splits it into one facet
# per census region (the data must already carry census_region,
# e.g. via fortify_with_census_region()).
plot_yield_vs_year_by_region <- function(data) {
  region_plot <- plot_yield_vs_year(data)
  region_plot + facet_wrap(vars(census_region))
}
# Try it on the wheat dataset
plot_yield_vs_year_by_region(wheat)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# Wrap the model code into a function
#
# Fits a GAM of yield on a smooth function of year plus a fixed
# effect for census region. Returns the fitted mgcv::gam object.
run_gam_yield_vs_year_by_region <- function(data) {
  model_spec <- yield_kg_per_ha ~ s(year) + census_region
  mgcv::gam(model_spec, data = data)
}
# Try it on the wheat dataset
wheat_model <- run_gam_yield_vs_year_by_region(wheat)
# Fit the corn model on the corn dataset.
# BUG FIX: the original fit corn_model on `wheat` (copy-paste error),
# so the "corn" predictions printed below were actually wheat predictions.
corn_model <- run_gam_yield_vs_year_by_region(corn)
# Make predictions in 2050
predict_this <- data.frame(year = 2050, census_region = unique(usa_census_regions$census_region))
# Predict the yield
pred_yield_kg_per_ha <- predict(corn_model, predict_this, type = "response")
predict_this %>%
# Add the prediction as a column of predict_this
mutate(pred_yield_kg_per_ha = pred_yield_kg_per_ha)
## year census_region pred_yield_kg_per_ha
## 1 2050 New England 902
## 2 2050 Mid-Atlantic 889
## 3 2050 East North Central 896
## 4 2050 West North Central 816
## 5 2050 South Atlantic 832
## 6 2050 East South Central 817
## 7 2050 West South Central 780
## 8 2050 Mountain 894
## 9 2050 Pacific 935
# Wrap this prediction code into a function
#
# For a fitted yield model, builds one prediction row per census
# region (regions taken from the file-level usa_census_regions
# table) at the given year, and returns the data frame with a
# pred_yield_kg_per_ha column appended.
predict_yields <- function(model, year) {
  new_data <- data.frame(
    year = year,
    census_region = unique(usa_census_regions$census_region)
  )
  new_data %>%
    mutate(
      pred_yield_kg_per_ha = predict(model, new_data, type = "response")
    )
}
# Try it on the wheat dataset
predict_yields(wheat_model, year=2050)
## year census_region pred_yield_kg_per_ha
## 1 2050 New England 902
## 2 2050 Mid-Atlantic 889
## 3 2050 East North Central 896
## 4 2050 West North Central 816
## 5 2050 South Atlantic 832
## 6 2050 East South Central 817
## 7 2050 West South Central 780
## 8 2050 Mountain 894
## 9 2050 Pacific 935
# From previous step
fortified_barley <- barley %>%
fortify_with_metric_units(crop="barley") %>%
fortify_with_census_region()
# Plot yield vs. year by region
plot_yield_vs_year_by_region(fortified_barley)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
fortified_barley %>%
# Run a GAM of yield vs. year by region
run_gam_yield_vs_year_by_region() %>%
# Make predictions of yields in 2050
predict_yields(year=2050)
## year census_region pred_yield_kg_per_ha
## 1 2050 New England 693
## 2 2050 Mid-Atlantic 696
## 3 2050 East North Central 690
## 4 2050 West North Central 630
## 5 2050 South Atlantic 696
## 6 2050 East South Central 658
## 7 2050 West South Central 596
## 8 2050 Mountain 760
## 9 2050 Pacific 699
Chapter 1 - Introduction
Introduction:
Grammar of Graphics:
Layers:
Example code includes:
data(mtcars)
# Explore the mtcars data frame with str()
str(mtcars)
## 'data.frame': 32 obs. of 11 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
# Execute the following command
ggplot(mtcars, aes(cyl, mpg)) +
geom_point()
# Change the command below so that cyl is treated as factor
ggplot(mtcars, aes(factor(cyl), mpg)) +
geom_point()
# Edit to add a color aesthetic mapped to disp
ggplot(mtcars, aes(wt, mpg, color=disp)) +
geom_point()
# Change the color aesthetic to a size aesthetic
ggplot(mtcars, aes(wt, mpg, size = disp)) +
geom_point()
data(diamonds)
str(diamonds)
## Classes 'tbl_df', 'tbl' and 'data.frame': 53940 obs. of 10 variables:
## $ carat : num 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
## $ cut : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
## $ color : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
## $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
## $ depth : num 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
## $ table : num 55 61 65 58 58 57 57 55 61 61 ...
## $ price : int 326 326 327 334 335 336 336 337 337 338 ...
## $ x : num 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
## $ y : num 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
## $ z : num 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
# Add geom_point() with +
ggplot(diamonds, aes(carat, price)) +
geom_point()
# Add geom_smooth() with +
ggplot(diamonds, aes(carat, price)) +
geom_point() +
geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# Make the points 40% opaque
ggplot(diamonds, aes(carat, price, color = clarity)) +
geom_point(alpha=0.4) +
geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# Draw a ggplot
plt_price_vs_carat <- ggplot(
# Use the diamonds dataset
diamonds,
# For the aesthetics, map x to carat and y to price
aes(x=carat, y=price)
)
# Add a point layer to plt_price_vs_carat
plt_price_vs_carat +
geom_point()
# Edit this to make points 20% opaque: plt_price_vs_carat_transparent
plt_price_vs_carat_transparent <- plt_price_vs_carat +
geom_point(alpha=0.2)
# See the plot
plt_price_vs_carat_transparent
# Edit this to map color to clarity,
# Assign the updated plot to a new object
plt_price_vs_carat_by_clarity <- plt_price_vs_carat +
geom_point(aes(color=clarity))
# See the plot
plt_price_vs_carat_by_clarity
Chapter 2 - Aesthetics
Visible Aesthetics:
Using Attributes:
Modifying Aesthetics:
Aesthetics Best Practices:
Example code includes:
mtcars <- mtcars %>%
mutate(fcyl=factor(cyl), fam=factor(am, levels=c(0, 1), labels=c("automatic", "manual")))
str(mtcars)
# Map x to mpg and y to fcyl
ggplot(mtcars, aes(x=mpg, y=fcyl)) +
geom_point()
# Swap mpg and fcyl
ggplot(mtcars, aes(x=fcyl, y=mpg)) +
geom_point()
# Map x to wt, y to mpg and color to fcyl
ggplot(mtcars, aes(x=wt, y=mpg, color=fcyl)) +
geom_point()
ggplot(mtcars, aes(wt, mpg, color = fcyl)) +
# Set the shape and size of the points
geom_point(shape=1, size=4)
# Map color to fam
ggplot(mtcars, aes(wt, mpg, fill = fcyl, color=fam)) +
geom_point(shape = 21, size = 4, alpha = 0.6)
# Base layer
plt_mpg_vs_wt <- ggplot(mtcars, aes(wt, mpg))
# Map fcyl to shape, not alpha
plt_mpg_vs_wt +
geom_point(aes(shape = fcyl))
# Base layer
plt_mpg_vs_wt <- ggplot(mtcars, aes(wt, mpg))
# Use text layer and map fcyl to label
plt_mpg_vs_wt +
geom_text(aes(label = fcyl))
# A hexadecimal color
my_blue <- "#4ABEFF"
# Change the color mapping to a fill mapping
ggplot(mtcars, aes(wt, mpg, fill = fcyl)) +
# Set point size and shape
geom_point(color=my_blue, size=10, shape=1)
ggplot(mtcars, aes(wt, mpg, color = fcyl)) +
# Add point layer with alpha 0.5
geom_point(alpha=0.5)
ggplot(mtcars, aes(wt, mpg, color = fcyl)) +
# Add text layer with label rownames(mtcars) and color red
geom_text(label=rownames(mtcars), color="red")
ggplot(mtcars, aes(wt, mpg, color = fcyl)) +
# Add points layer with shape 24 and color yellow
geom_point(shape=24, color="yellow")
# 5 aesthetics: add a mapping of size to hp / wt
ggplot(mtcars, aes(mpg, qsec, color = fcyl, shape = fam, size=hp/wt)) +
geom_point()
ggplot(mtcars, aes(fcyl, fill = fam)) +
geom_bar() +
# Set the axis labels
labs(x="Number of Cylinders", y="Count")
palette <- c(automatic = "#377EB8", manual = "#E41A1C")
ggplot(mtcars, aes(fcyl, fill = fam)) +
geom_bar() +
labs(x = "Number of Cylinders", y = "Count") +
# Set the fill color scale
scale_fill_manual("Transmission", values = palette)
palette <- c(automatic = "#377EB8", manual = "#E41A1C")
# Set the position
ggplot(mtcars, aes(fcyl, fill = fam)) +
geom_bar(position="dodge") +
labs(x = "Number of Cylinders", y = "Count") +
scale_fill_manual("Transmission", values = palette)
ggplot(mtcars, aes(mpg, 0)) +
geom_jitter() +
# Set the y-axis limits
ylim(c(-2, 2))
Chapter 3 - Geometries
Scatter Plots:
Histograms:
Bar Plots:
Line Plots:
Example code includes:
# Plot price vs. carat, colored by clarity
plt_price_vs_carat_by_clarity <- ggplot(diamonds, aes(carat, price, color = clarity))
# Set transparency to 0.5
plt_price_vs_carat_by_clarity +
geom_point(alpha = 0.5, shape = 16)
# Plot base
plt_mpg_vs_fcyl_by_fam <- ggplot(mtcars, aes(fcyl, mpg, color = fam))
# Default points are shown for comparison
plt_mpg_vs_fcyl_by_fam +
geom_point()
# Now jitter and dodge the point positions
plt_mpg_vs_fcyl_by_fam +
geom_point(position = position_jitterdodge(jitter.width=0.3, dodge.width=0.3))
data(iris)
ggplot(iris, aes(Sepal.Length, Sepal.Width, color = Species)) +
# Swap for jitter layer with width 0.1
geom_jitter(width=0.1, alpha=0.5)
ggplot(iris, aes(Sepal.Length, Sepal.Width, color = Species)) +
# Set the position to jitter
geom_point(position="jitter", alpha = 0.5)
ggplot(iris, aes(Sepal.Length, Sepal.Width, color = Species)) +
# Use a jitter position function with width 0.1
geom_point(position=position_jitter(width=0.1), alpha = 0.5)
data(Vocab, package="carData")
# Examine the structure of Vocab
str(Vocab)
# Plot vocabulary vs. education
ggplot(Vocab, aes(x=education, y=vocabulary)) +
# Add a point layer
geom_point()
ggplot(Vocab, aes(education, vocabulary)) +
# Set the shape to 1
geom_jitter(alpha = 0.2, shape=1)
datacamp_light_blue <- "#51A8C9"
ggplot(mtcars, aes(x=mpg, y=..density..)) +
# Set the fill color to datacamp_light_blue
geom_histogram(binwidth = 1, fill=datacamp_light_blue)
ggplot(mtcars, aes(mpg, fill = fam)) +
# Stack the bars as proportions: position "fill" normalizes each bar to 1
# (NOTE: the original comment claimed identity/alpha 0.4, which describes
# the NEXT plot, not this one)
geom_histogram(binwidth = 1, position = "fill")
ggplot(mtcars, aes(mpg, fill = fam)) +
# Change the position to identity, with transparency 0.4
geom_histogram(binwidth = 1, position = "identity", alpha=0.4)
# Plot fcyl, filled by fam
ggplot(mtcars, aes(x=fcyl, fill=fam)) +
# Add a bar layer
geom_bar()
ggplot(mtcars, aes(x=fcyl, fill = fam)) +
# Set the position to "fill"
geom_bar(position="fill")
ggplot(mtcars, aes(fcyl, fill = fam)) +
# Change the position to "dodge"
geom_bar(position = "dodge")
ggplot(mtcars, aes(cyl, fill = fam)) +
# Change position to use the functional form, with width 0.2
geom_bar(position = position_dodge(width=0.2))
ggplot(mtcars, aes(cyl, fill = fam)) +
# Set the transparency to 0.6
geom_bar(position = position_dodge(width = 0.2), alpha=0.6)
# Plot education, filled by vocabulary
ggplot(Vocab, aes(x=education, fill = factor(vocabulary))) +
# Add a bar layer with position "fill"
geom_bar(position="fill")
# Plot education, filled by vocabulary
ggplot(Vocab, aes(education, fill = factor(vocabulary))) +
# Add a bar layer with position "fill"
geom_bar(position = "fill") +
# Add a brewer fill scale with default palette
scale_fill_brewer()
data(economics)
# Print the head of economics
head(economics)
# Using economics, plot unemploy vs. date
ggplot(economics, aes(x=date, y=unemploy)) +
# Make it a line plot
geom_line()
# Change the y-axis to the proportion of the population that is unemployed
ggplot(economics, aes(x=date, y=unemploy/pop)) +
geom_line()
load("./RInputFiles/fish.RData")
# Plot the Rainbow Salmon time series
ggplot(fish.species, aes(x = Year, y = Rainbow)) +
geom_line()
# Plot the Pink Salmon time series
ggplot(fish.species, aes(x = Year, y = Pink)) +
geom_line()
# Plot multiple time-series by grouping by species
ggplot(fish.tidy, aes(Year, Capture)) +
geom_line(aes(group = Species))
# Plot multiple time-series by coloring by species
ggplot(fish.tidy, aes(x = Year, y = Capture, color = Species)) +
geom_line()
Chapter 4 - Themes
Themes from Scratch:
Theme Flexibility:
Effective Explanatory Plots:
Example code includes:
recess <- data.frame(begin=as.Date(c('1969-12-01', '1973-11-01', '1980-01-01', '1981-07-01', '1990-07-01', '2001-03-01', '2007-12-01')),
end=as.Date(c('1970-11-01', '1975-03-01', '1980-07-01', '1982-11-01', '1991-03-01', '2001-11-01', '2009-07-30')),
event=c('Fiscal & Monetary\ntightening', '1973 Oil crisis', 'Double dip I', 'Double dip II', 'Oil price shock', 'Dot-com bubble', 'Sub-prime\nmortgage crisis'),
y=c(0.01416, 0.02067, 0.02951, 0.03419, 0.02767, 0.0216, 0.02521)
)
recess
events <- recess %>%
select(begin, y) %>%
rename(date=begin)
events
# Change the y-axis to the proportion of the population that is unemployed
plt_prop_unemployed_over_time <- ggplot(economics, aes(x=date, y=unemploy/pop)) +
geom_line(lwd=1.25) +
labs(title="The percentage of unemployed Americans\nincreases sharply during recessions") +
geom_rect(data=recess, aes(xmin=begin, xmax=end, ymin=0.01, ymax=0.055, fill="red"),
inherit.aes=FALSE, alpha=0.25
) +
geom_label(data=recess, aes(x=begin, y=y, label=event))
# View the default plot
plt_prop_unemployed_over_time
# Remove legend entirely
plt_prop_unemployed_over_time +
theme(legend.position="none")
# Position the legend at the bottom of the plot
plt_prop_unemployed_over_time +
theme(legend.position="bottom")
# Position the legend inside the plot at (0.6, 0.1)
plt_prop_unemployed_over_time +
theme(legend.position=c(0.6, 0.1))
plt_prop_unemployed_over_time +
theme(
# For all rectangles, set the fill color to grey92
rect = element_rect(fill = "grey92"),
# For the legend key, turn off the outline
legend.key = element_rect(color=NA)
)
plt_prop_unemployed_over_time +
theme(
rect = element_rect(fill = "grey92"),
legend.key = element_rect(color = NA),
# Turn off axis ticks
axis.ticks = element_blank(),
# Turn off the panel grid
panel.grid = element_blank()
)
plt_prop_unemployed_over_time +
theme(
rect = element_rect(fill = "grey92"),
legend.key = element_rect(color = NA),
axis.ticks = element_blank(),
panel.grid = element_blank(),
# Add major y-axis panel grid lines back
panel.grid.major.y = element_line(
# Set the color to white
color="white",
# Set the size to 0.5
size=0.5,
# Set the line type to dotted
linetype="dotted"
),
# Set the axis text color to grey25
axis.text = element_text(color="grey25"),
# Set the plot title font face to italic and font size to 16
plot.title = element_text(size=16, face="italic")
)
plt_mpg_vs_wt_by_cyl <- ggplot(mtcars, aes(x=wt, y=mpg, color=fcyl)) +
geom_point() +
labs(x="Weight (1000s of lbs)", y="Miles per Gallon")
# View the original plot
plt_mpg_vs_wt_by_cyl
plt_mpg_vs_wt_by_cyl +
theme(
# Set the axis tick length to 2 lines
axis.ticks.length=unit(2, "lines")
)
plt_mpg_vs_wt_by_cyl +
theme(
# Set the legend key size to 3 centimeters
legend.key.size = unit(3, "cm")
)
plt_mpg_vs_wt_by_cyl +
theme(
# Set the legend margin to (20, 30, 40, 50) points
legend.margin = margin(20, 30, 40, 50, "pt")
)
plt_mpg_vs_wt_by_cyl +
theme(
# Set the plot margin to (10, 30, 50, 70) millimeters
plot.margin=margin(10, 30, 50, 70, "mm")
)
# Whitespace means all the non-visible margins and spacing in the plot.
# To set a single whitespace value, use unit(x, unit), where x is the amount and unit is the unit of measure.
# Borders require you to set 4 positions, so use margin(top, right, bottom, left, unit)
# To remember the margin order, think TRouBLe
# The default unit is "pt" (points), which scales well with text
# Other options include "cm", "in" (inches) and "lines" (of text)
# Add a black and white theme
plt_prop_unemployed_over_time +
theme_bw()
# Add a classic theme
plt_prop_unemployed_over_time +
theme_classic()
# Add a void theme
plt_prop_unemployed_over_time +
theme_void()
# theme_gray() is the default.
# theme_bw() is useful when you use transparency.
# theme_classic() is more traditional.
# theme_void() removes everything but the data.
# Use the fivethirtyeight theme
plt_prop_unemployed_over_time +
ggthemes::theme_fivethirtyeight()
# Use Tufte's theme
plt_prop_unemployed_over_time +
ggthemes::theme_tufte()
# Use the Wall Street Journal theme
plt_prop_unemployed_over_time +
ggthemes::theme_wsj()
theme_recession <- theme(
rect = element_rect(fill = "grey92"),
legend.key = element_rect(color = NA),
axis.ticks = element_blank(),
panel.grid = element_blank(),
panel.grid.major.y = element_line(color = "white", size = 0.5, linetype = "dotted"),
axis.text = element_text(color = "grey25"),
plot.title = element_text(face = "italic", size = 16),
legend.position = c(0.6, 0.1)
)
theme_tufte_recession <- ggthemes::theme_tufte() + theme_recession
themeOld <- theme_get()
theme_set(themeOld)
# Set theme_tufte_recession as the default theme
theme_set(theme_tufte_recession)
plt_prop_unemployed_over_time +
# Add Tufte's theme
ggthemes::theme_tufte()
# Draw the plot (without explicitly adding a theme)
plt_prop_unemployed_over_time
plt_prop_unemployed_over_time +
ggthemes::theme_tufte() +
# Add individual theme elements
theme(
# Turn off the legend
legend.position = "none",
# Turn off the axis ticks
axis.ticks = element_blank()
)
plt_prop_unemployed_over_time +
ggthemes::theme_tufte() +
theme(
legend.position = "none",
axis.ticks = element_blank(),
axis.title = element_text(color = "grey60"),
axis.text = element_text(color = "grey60"),
# Set the panel gridlines major y values
panel.grid.major.y = element_line(
# Set the color to grey60
color="grey60",
# Set the size to 0.25
size=0.25,
# Set the linetype to dotted
linetype="dotted"
)
)
theme_set(themeOld)
data(gapminder, package="gapminder")
ctry <- c('Swaziland', 'Mozambique', 'Zambia', 'Sierra Leone', 'Lesotho', 'Angola', 'Zimbabwe', 'Afghanistan', 'Central African Republic', 'Liberia', 'Canada', 'France', 'Israel', 'Sweden', 'Spain', 'Australia', 'Switzerland', 'Iceland', 'Hong Kong, China', 'Japan')
gm2007 <- gapminder %>%
filter(year==2007, country %in% ctry) %>%
select(country, lifeExp, continent) %>%
arrange(lifeExp)
gm2007
# Set the color scale
palette <- RColorBrewer::brewer.pal(5, "RdYlBu")[-(2:4)]
# Add a title and caption
plt_country_vs_lifeExp <- ggplot(gm2007, aes(x = lifeExp, y = fct_reorder(country, lifeExp), color = lifeExp)) +
geom_point(size = 4) +
geom_segment(aes(xend = 30, yend = country), size = 2) +
geom_text(aes(label = round(lifeExp,1)), color = "white", size = 1.5) +
scale_x_continuous("", expand = c(0,0), limits = c(30,90), position = "top") +
scale_color_gradientn(colors = palette) +
labs(title="Highest and lowest life expectancies, 2007", caption="Source: gapminder")
plt_country_vs_lifeExp
# Define the theme
plt_country_vs_lifeExp +
theme_classic() +
theme(axis.line.y = element_blank(), axis.ticks.y = element_blank(), axis.text = element_text(color="black"), axis.title = element_blank(), legend.position = "none")
global_mean <- gapminder %>% filter(year==2007) %>% pull(lifeExp) %>% mean()
x_start <- global_mean + 4
y_start <- 5.5
x_end <- global_mean
y_end <- 7.5
# Add text
plt_country_vs_lifeExp +
theme_classic() +
theme(axis.line.y = element_blank(), axis.ticks.y = element_blank(), axis.text = element_text(color="black"), axis.title = element_blank(), legend.position = "none") +
geom_vline(xintercept = global_mean, color = "grey40", linetype = 3) +
annotate("text", x = x_start, y = y_start, label = "The\nglobal\naverage", vjust = 1, size = 3, color = "grey40")
# Add a curve
plt_country_vs_lifeExp +
theme_classic() +
theme(axis.line.y = element_blank(), axis.ticks.y = element_blank(), axis.text = element_text(color="black"), axis.title = element_blank(), legend.position = "none") +
geom_vline(xintercept = global_mean, color = "grey40", linetype = 3) +
annotate("text", x = x_start, y = y_start, label = "The\nglobal\naverage", vjust = 1, size = 3, color = "grey40") +
annotate("curve", x = x_start, y = y_start, xend = x_end, yend = y_end, arrow = arrow(length = unit(0.2, "cm"), type = "closed"), color = "grey40")
theme_set(themeOld)
Chapter 1 - Data Pre-processing and Visualization
Data Normalization:
Handling Missing Data:
Detecting Anomalies in Data:
Example code includes:
# fifa_sample <- read_csv("./RInputFiles/fifa_sample.xls")
# glimpse(fifa_sample)
apps <- read_csv("./RInputFiles/googleplaystore.xls")
apps <- apps[-10473, ]
glimpse(apps)
cars <- read_csv("./RInputFiles/car-fuel-consumption-1.xls")
glimpse(cars)
fifa_sample <- tibble::tibble(SP=c(43, 70, 47, 22, 74, 45, 65, 71, 66, 62, 58, 55, 57, 15, 67, 66, 46, 65, 71, 80, 68, 62, 49, 70, 55, 17, 56, 48, 25, 62, 14, 55, 17, 43, 62, 63, 52, 62, 58, 81, 73, 59, 60, 66, 43, 59, 58, 79, 16, 64, 14, 12, 68, 78, 36, 52, 59, 67, 75, 80, 38, 73, 56, 80, 66, 68, 72, 41, 72, 51, 66, 37, 75, 19, 15, 34, 69, 86, 74, 57, 80, 51, 76, 63, 22, 76, 43, 22, 46, 39, 55, 81, 77, 62, 81, 19, 70, 74, 60, 59),
RA=c(190, 12, 353, 669, 2.5, 2.6, 406, 18.6, 5.1, 653, 900, 450, 3.9, 713, 1.9, 4.8, 140, 1.6, 1.3, 38.1, 1.6, 953, 891, 2.2, 357, 149, 3.4, 1.7, 7, 347, 105, 2.4, 1.9, 73, 4.8, 801, 3.8, 1.2, 9.5, 6.8, 2.5, 656, 1.5, 7, 631, 1.9, 125, 6.4, 2.6, 648, 1.3, 1.3, 6.7, 20.8, 3.6, 305, 1, 357, 7.5, 17.1, 140, 1.4, 3, 10.3, 795, 6.5, 2.6, 530, 2.7, 495, 12.8, 850, 1.2, 436, 639, 945, 619, 164, 10.2, 639, 5, 365, 1.2, 350, 63, 11.7, 8.7, 534, 2.5, 413, 225, 15.2, 1.6, 534, 14.7, 119, 6.9, 20, 1.5, 512)
)
# Glimpse at the dataset
glimpse(fifa_sample)
# Compute the scale of every feature
(fifa_scales <- sapply(fifa_sample, range))
# Plot fifa_sample data
ggplot(fifa_sample, aes(x=SP, y=RA)) +
geom_point(colour="blue", size=5) +
labs(title = "Original data", x="Shot power", y="Release amount (millions EUR)") +
theme(plot.title = element_text(size=22), text = element_text(size=18)) +
scale_x_continuous(breaks = round(seq(0, max(fifa_sample$SP), by = 5),1))
# Apply max-min and standardization: fifa_normalized
fifa_normalized <- fifa_sample %>%
mutate(SP_MaxMin = (SP-min(SP))/(max(SP)-min(SP)), RA_MaxMin = (RA-min(RA))/(max(RA)-min(RA)),
SP_ZScore = (SP - mean(SP)) / sd(SP), RA_ZScore = (RA - mean(RA)) / sd(RA)
)
# Compute the scale of every feature: fifa_normalized_scales
(fifa_normalized_scales <- sapply(fifa_normalized, range))
# Boxplot of original and normalized distributions
boxplot(fifa_normalized[, c("SP", "RA")], main = 'Original')
boxplot(fifa_normalized[, c("SP_MaxMin", "RA_MaxMin")], main = 'Max-Min')
boxplot(fifa_normalized[, c("SP_ZScore", "RA_ZScore")], main = 'Z-Score')
bands <- tibble::tibble(Blade_pressure=c('20', '20', '30', '30', '30', '28', '30', '28', '60', '32', '30', '40', '30', '25', '20', '?', '?', '?', '?', '?', '30', '30', '25', '30', '25', '20', '30', '25', '30', '35', '28', '30', '22', '20', '35', '?', '30', '28', '31', '34', '32', '?', '30', '30', '24', '20', '35', '25', '25', '34', '16', '20', '28', '25', '30', '35', '46', '50', '25', '30'),
Roughness=c('0.75', '0.75', '?', '0.312', '0.75', '0.438', '0.75', '0.75', '0.75', '1.0', '0.75', '0.75', '1.0', '0.625', '1.0', '1.0', '?', '?', '0.75', '0.75', '0.812', '0.812', '0.812', '1.0', '1.0', '1.0', '1.0', '1.0', '0.75', '0.75', '0.75', '0.75', '0.625', '0.625', '0.75', '0.875', '0.625', '1.0', '1.0', '0.75', '1.0', '0.875', '0.875', '0.812', '0.75', '0.75', '0.812', '0.625', '0.625', '0.5', '0.75', '0.75', '0.75', '0.875', '0.625', '?', '0.75', '0.75', '0.625', '0.875'),
Ink_pct=c('50.5', '54.9', '53.8', '55.6', '57.5', '53.8', '62.5', '62.5', '60.2', '45.5', '48.5', '52.6', '50.0', '59.5', '49.5', '62.5', '62.5', '58.8', '54.9', '56.2', '58.8', '62.5', '58.1', '62.5', '57.5', '57.5', '57.5', '58.8', '58.8', '58.8', '45.0', '43.5', '54.3', '53.2', '58.8', '63.0', '58.1', '58.8', '54.3', '62.5', '58.1', '61.7', '55.6', '55.6', '58.1', '56.2', '58.8', '57.5', '58.8', '61.0', '50.5', '50.5', '58.8', '58.8', '62.5', '55.6', '58.8', '62.5', '52.6', '54.9'),
Ink_temperature=c('17.0', '15.0', '16.0', '16.0', '17.0', '16.8', '16.5', '16.5', '12.0', '16.0', '16.0', '14.0', '15.0', '14.5', '16.0', '15.0', '14.0', '15.5', '16.4', '16.5', '16.0', '15.0', '16.3', '15.8', '14.5', '14.0', '15.0', '15.2', '15.0', '17.0', '16.0', '16.5', '14.1', '14.0', '17.0', '15.4', '15.0', '16.0', '15.0', '15.0', '16.0', '15.4', '16.0', '16.3', '15.8', '16.6', '17.0', '13.0', '14.0', '15.9', '17.0', '16.5', '15.0', '16.5', '18.0', '17.0', '12.0', '16.0', '14.6', '24.5'),
Band_type=c('band', 'noband', 'noband', 'noband', 'noband', 'noband', 'noband', 'noband', 'band', 'noband', 'noband', 'band', 'band', 'band', 'noband', 'band', 'noband', 'noband', 'noband', 'noband', 'noband', 'noband', 'noband', 'band', 'band', 'band', 'noband', 'noband', 'band', 'band', 'noband', 'noband', 'noband', 'noband', 'band', 'noband', 'band', 'noband', 'noband', 'noband', 'noband', 'noband', 'noband', 'noband', 'noband', 'noband', 'noband', 'noband', 'noband', 'noband', 'band', 'band', 'noband', 'noband', 'noband', 'noband', 'noband', 'noband', 'band', 'band')
)
str(bands)
# Check for missing values using base R and naniar functions
any(is.na(bands))
naniar::any_na(bands)
# What? No missing values! Take a closer glimpse
glimpse(bands)
# Replace ? with NAs: bands
bands <- naniar::replace_with_na_all(bands, ~.x == '?')
# Compute missingness summaries
naniar::miss_var_summary(bands)
# Visualize overall missingness
naniar::vis_miss(bands)
# Visualize overall missingness, clustered
naniar::vis_miss(bands, cluster = TRUE)
# Visualize missingness in each variable
naniar::gg_miss_var(bands)
# Missingness in variables, faceted by Band_type
naniar::gg_miss_var(bands, facet = Band_type)
# Visualize missingness in cases
naniar::gg_miss_case(bands)
# Impute with the mean
imp_mean <- bands %>%
naniar::bind_shadow(only_miss = TRUE) %>%
naniar::add_label_shadow() %>%
naniar::impute_mean_all()
# Impute with lm
imp_lm <- bands %>%
naniar::bind_shadow(only_miss = TRUE) %>%
naniar::add_label_shadow() %>%
simputation::impute_lm(Blade_pressure ~ Ink_temperature) %>%
simputation::impute_lm(Roughness ~ Ink_temperature) %>%
simputation::impute_lm(Ink_pct ~ Ink_temperature)
# Peek at the first few rows of imp_models_long
# head(imp_models_long)
# Visualize post-imputation distributions
# ggplot(imp_models_long, aes(x = imp_model, y = value)) +
# geom_violin(aes(fill=imp_model)) +
# facet_wrap(~variable, scales='free_y')
# Calculate post-imputation distribution stats
# imp_models_long %>%
# group_by(imp_model, variable) %>%
# summarize(var = var(value), avg = mean(value),
# median = median(value)) %>%
# arrange(variable)
# Peek at the cars dataset
head(cars)
# Boxplot of consume variable distribution
boxplot(cars$consume)
# Five-number summary: consume_quantiles
(consume_quantiles <- quantile(cars$consume))
# Calculate upper threshold: upper_th
upper_th <- consume_quantiles["75%"] + 1.5 * (consume_quantiles["75%"] - consume_quantiles["25%"])
# Print the sorted vector of distinct potential outliers
sort(unique(cars$consume[cars$consume > upper_th]))
# Scale data and create scatterplot: cars_scaled
cars_scaled <- cars %>%
select(distance, consume) %>%
scale() %>%
as.data.frame()
plot(distance ~ consume, data = cars_scaled, main = 'Fuel consumption vs. distance')
# Compute KNN score
cars_knn <- FNN::get.knn(data = cars_scaled, k = 7)
cars$knn_score <- rowMeans(cars_knn$nn.dist)
# Print top 5 KNN scores and data point indices: top5_knn
(top5_knn <- order(cars$knn_score, decreasing = TRUE)[1:5])
print(cars$knn_score[top5_knn])
# Plot variables using KNN score as size of points
plot(distance ~ consume, cex = knn_score, data = cars, pch = 20)
# Scale cars data: cars_scaled
cars_scaled <- cars %>%
select(distance, consume, knn_score) %>%
scale() %>%
as.data.frame()
# Add lof_score column to cars
cars$lof_score <- dbscan::lof(cars_scaled, k = 7)
# Print top 5 LOF scores and data point indices: top5_lof
(top5_lof <- order(cars$lof_score, decreasing = TRUE)[1:5])
print(cars$lof_score[top5_lof])
# Plot variables using LOF score as size of points
plot(distance ~ consume, cex = lof_score, data = cars, pch = 20)
Chapter 2 - Supervised Learning
Interpretable Models:
Regularization:
Bias and Variance:
Building Ensemble Models:
Example code includes:
car <- cars %>%
select(distance, consume, speed, temp_outside, gas_type, AC) %>%
mutate(gas_type=factor(gas_type), AC=factor(AC, labels=c("Off", "On")))
test_instance <- tibble::tibble(distance=12.4, consume=5.1, speed=45, temp_outside=5,
gas_type=factor("E10", levels=c("E10", "SP98")),
AC=factor("Off", levels=c("Off", "On"))
)
test_instance
# Glimpse on the car dataset
glimpse(car)
# Build a multivariate regression model: car_lr
car_lr <- lm(consume ~ ., data = car)
# Summarize the model and display its coefficients
summary(car_lr)$coef
# Predict with linear regression model
predict(car_lr, test_instance)
# Build a regression tree: car_dt
car_dt <- rpart::rpart(consume ~ ., data = car)
# Fancy tree plot
rattle::fancyRpartPlot(car_dt)
# Extract rules from the tree
rpart.plot::rpart.rules(car_dt)
# Predict test instance with decision tree
predict(car_dt, test_instance)
fifaRaw <- c(0.569, -1.555, -0.068, -0.705, -1.342, -0.28, -0.068, -1.98, -0.28, 0.357, -1.767, 0.357, -0.28, -0.918, -0.068, -0.068, 2.693, -0.918, 0.357, -0.068, -0.918, 0.144, -0.493, -1.767, -0.28, 1.419, 0.357, -0.28, 1.844, -0.493, -1.342, -1.342, -0.28, -1.13, 1.207, 2.269, -0.28, 1.419, 0.357, -0.068, -0.918, -0.918, 0.569, -0.28, 0.144, -0.28, -0.705, -0.28, 2.693, -1.342, 0.782, 0.357, -1.555, 0.994, 1.207, 0.994, 0.357, -0.918, -0.28, -0.493, -0.705, 0.144, -0.068, 1.207, -1.13, -0.918, -0.918, -0.068, -1.555, -0.068, 1.207, 0.357, -1.342, 0.569, -0.493, 1.631, -0.918, 0.994, 1.207, -0.068, 0.357, -0.705, 0.782, -0.068, 2.481, 1.631, -0.918, -0.918, -1.767, 1.631, 0.782, -0.28, 0.569, -0.705, 0.144, 2.481, -1.342, 0.782, 0.144, 0.144, -1.342, -0.493, -0.918, -1.555, 0.357, -1.555, -0.493, -0.493, -0.493, 0.569, -1.342, -0.28, 1.207, -1.555, -1.342, -0.068, 1.419, -1.555, 0.569, -0.493, 1.207, 0.144, -0.705, -0.493, 2.056, -1.342, 1.207, -1.13, 0.782, -0.068, -0.493, -1.13, -0.068, -0.493, -0.068, 0.994, -1.13, 0.357, -1.13, -0.068, -1.13, -0.705, -0.068, -0.28, -0.705, 0.782, 1.207, 1.631, 0.994, -1.13, -1.13, 0.994, 0.569, 0.994, 0.782, 0.144, 0.144, 0.144, -1.13, -0.068, -0.705, 0.144, 0.357, 0.782, 0.782, -0.705, -0.28, -0.705, 1.631, 2.056, 1.631, -0.068, -1.13, 0.144, -0.28, 1.207, -0.493, 1.419, -1.555, 0.782, -1.13, -1.13, -1.13, -0.068, 0.782, -0.918, 0.144, 0.569, -0.068, -0.493, 0.994, -0.918, -1.342, -0.493, -0.918, 0.569, -0.28, -0.705, 0.144, -1.342, 0.994, -1.555, 1.207, -0.28, 2.056, -1.13, -0.918, 1.631, 0.357, -0.705, -0.493, 2.056, -0.493, 1.844, 0.782, -0.068, 0.144, -0.705, -0.918, -0.28, 0.782, -1.555, -0.918, 2.056, 0.994, -0.493, -0.493, 0.144, -1.342, -1.342, 0.782, 0.994, 0.144, -0.28, 0.144, -0.493, -0.493, -0.918, 0.782, -1.342, 0.144, 0.144, -0.28, 1.419, -0.068, -0.068, 0.782, -1.767, -0.918, -0.068, -0.493, 0.357, -1.13, -0.28, 0.357, -1.13, -0.28, 0.994, -1.555, 1.207, -0.28, -0.705, -1.13, 0.144, -0.918, -0.705, 0.994, 
0.357, 0.357, -0.705, -0.493, -0.493, 0.357, -0.28, -0.493, 0.569, -1.342, -0.068, 0.569, 2.269, -0.493, 1.419, 1.631, -1.13, 0.144, 2.056, -0.068, 0.994, 1.631, -0.068, 1.419, 2.481, 1.207, -0.28, 2.056, -1.555, -0.918, 0.782, 0.782, -0.493, 1.631, -0.28, -1.342, -0.918, -1.555, -0.068, 0.569, 1.207, -0.068, 0.569, -0.705, -0.28, -1.342, 1.631, 0.357, -0.068, -1.555, 0.357, 0.782, -0.705, -1.13, -0.493, -0.28, -0.918, 0.994, -0.493, -1.342, 0.357, 0.782, -0.918, -0.28, 1.419, 0.144, -1.13, -0.28, -0.28, 0.569, -1.342, -0.918, -1.342, 1.844, -0.068, -0.28, -0.068, 0.144, -0.28, -1.767, -0.28, -0.705, 0.144, -1.13, -1.13, -0.705, 0.569, -0.068, -1.342, 0.357, 0.144, 2.056, -0.705, 1.631, 0.782, -1.342, 1.419, -0.28, -0.28, 0.144, 1.419, 1.631, 0.569, -0.705, 2.056, -1.767, -0.918, -0.28, -1.555, -0.068, -0.068, 0.144, -1.555, -0.493, -0.068, -0.068, -0.705, -0.28, -1.13, -0.068, 1.419, 2.056, -0.493, -0.918, -0.705, -0.918, 1.419, -0.493, -0.28, 0.144, -0.493, -0.918, -1.342, -0.28, 0.357, -1.13, 1.631, 0.782, 0.357, 0.994, 0.782, -0.068, -0.918, -0.28, 1.844, -0.28, 0.782, 0.357, -1.13, -1.13, 0.569, 0.569, -0.068, 0.782, 0.357, 0.782, 0.144, 1.844, 1.207, 0.144, 0.357, 0.357, -1.13, -1.767, -0.068, 2.056, -1.342, 1.631, -0.068, 1.631, -0.068, 0.357, 1.419, 0.782, 0.569, -0.918, 0.569, -0.918, 1.419, -0.28, 0.569, 0.994, 0.357, 0.782, 0.357, 1.207, 0.782, -0.918, -0.493, -0.28, -1.98, 0.994, -1.342, -1.342, 0.357, 0.144, -0.493, -0.068, -1.342, -0.705, -0.918, 0.357, -1.555, 0.357, 1.419, 0.357, -0.068, 1.419, 1.631, 1.419, 1.419, 0.144, -0.493, 0.569, 1.844, 0.569, -1.13, -0.28, 1.631, 1.844, 1.207, -0.705, -1.555, -1.342, -0.705, -0.705, 2.056, 1.419, -0.918, -0.493, 0.994, -0.705, 0.782, -0.198, 1.373, -0.023, -0.198, -0.023, 2.246, -0.198, 0.152, -0.547, -0.547, 0.675, -0.721, -0.896, 0.501, 1.199, -1.943, -0.023, 2.595, -0.372, -2.467, -0.198, -1.245, -0.372, -0.023, -0.023)
fifaRaw <- c(fifaRaw, -1.419, -0.896, -0.372, -0.198, 1.548, -1.245, -0.198, -0.547, 1.024, 1.548, -0.721, -0.721, 0.152, -0.198, 0.501, 1.722, 0.85, -0.547, -0.198, 0.326, 1.024, 0.85, 0.326, -0.198, -0.023, 1.199, -0.721, 1.548, 0.501, -1.07, 0.152, 0.675, -0.198, -1.594, 0.501, -0.198, 0.85, -0.198, -0.198, -0.023, 1.024, -0.023, -1.07, 0.85, -0.547, -1.594, -1.07, 0.675, 3.293, 0.326, -0.023, 1.373, -1.245, -0.372, 1.373, 0.326, 0.501, -0.547, -0.896, 0.675, -0.721, -1.245, -0.547, 0.85, 0.326, 0.501, -0.372, -0.372, 0.675, -0.372, 0.152, 0.501, 0.675, -0.547, -0.896, 0.85, 0.85, 1.199, -0.547, -0.547, 0.501, -1.07, -0.023, 0.326, 0.326, 0.501, 0.501, 1.024, 0.152, 0.675, 1.024, -1.07, -0.198, -1.07, 0.501, -1.245, -0.198, 1.024, 1.548, -0.547, -1.07, 0.326, 1.722, 0.501, -0.721, -0.547, -0.198, -0.547, -0.896, -0.198, -1.245, -0.198, -1.594, -1.768, -0.023, 1.897, 0.152, 2.421, 1.199, -1.07, -0.023, -0.023, 0.85, -0.023, 0.675, -0.198, -0.896, -0.547, -0.023, 0.675, -0.372, -0.896, 0.326, 1.024, 0.85, 0.501, -0.198, -1.245, -0.547, -0.372, 0.85, -0.896, 2.071, -0.547, -0.198, -1.245, 0.326, 0.326, -0.721, -0.198, -1.245, 0.152, -0.896, 0.501, -1.07, -0.372, -2.292, 0.326, 0.501, -1.07, 1.199, 0.152, -1.07, -1.943, -1.245, -0.198, 1.199, -1.768, 0.326, -1.245, -0.721, 0.326, 0.326, 0.85, 1.548, -0.023, -1.245, -1.07, -0.023, -0.547, 0.326, 1.024, -0.198, 0.326, 1.199, 0.85, -0.023, 1.199, -1.419, 1.199, -0.896, -0.023, 0.501, 0.152, -2.118, -0.198, -0.023, -0.023, -1.419, 1.024, 0.85, 0.501, -1.07, 0.85, -0.198, -1.245, -0.721, -0.721, 1.897, 0.326, 0.152, 1.024, -0.198, 0.326, 0.501, -0.198, -1.07, 1.199, -1.245, 0.675, -0.721, -0.547, 0.152, -1.07, -1.245, 0.85, -0.023, 1.373, 1.024, -0.547, -0.372, 1.024, -0.547, -0.372, -2.118, -1.768, -1.07, 1.024, -2.816, 1.024, 0.501, -0.372, -0.372, 1.897, 0.152, -0.721, 2.944, 1.024, 1.199, -0.721, -1.594, -0.896, -0.372, 0.85, -1.245, -0.372, 0.501, -0.547, 2.071, 1.548, -0.547, 1.548, -0.023, 1.024, -0.721, -0.198, 
0.675, 0.85, -1.245, -1.594, 0.85, -1.419, 0.675, 0.675, -0.721, -1.768, 3.293, 0.326, 0.326, 1.024, -0.547, 0.326, 2.246, -0.198, -0.547, 1.373, 0.152, 1.199, 1.024, 0.152, -1.419, -0.372, -0.023, -0.372, -0.372, -0.023, 1.373, 1.024, 0.326, 0.675, 2.246, -0.547, 1.548, -0.372, -0.547, -0.023, -0.198, -1.07, 0.326)
fifaRaw <- c(fifaRaw, 0.675, 0.152, -1.419, 1.373, -0.023, 0.152, -0.547, 1.373, -0.023, 0.501, -0.198, -0.023, 0.501, 0.152, 1.199, -1.07, 2.421, -1.07, 1.024, 0.326, -0.198, -1.245, -1.245, -0.198, -0.372, -2.467, 0.501, 0.152, -0.023, -0.198, 0.326, -0.896, 0.501, 1.897, -0.547, -0.721, 0.501, -0.372, 1.373, 0.675, 0.326, 0.152, -1.07, 0.675, -0.198, -0.198, 0.152, -0.721, -0.198, -0.372, -0.023, -0.896, 0.675, -0.198, 0.675, 1.373, -1.594, 1.373, 0.326, -1.768, 1.722, 1.024, 0.326, 1.548, 1.722, -1.245, -2.292, 0.675, -0.547, -1.07, -0.547, -0.721, -1.419, 0.85, 0.501, 0.501, 0.501, -1.943, 1.199, -0.896, 0.152, 2.071, 0.152, -0.372, -0.721, 2.071, -0.896, -1.245, -1.07, -0.198, -2.292, -1.245, -0.023, -0.547, 0.501, -0.547, 0.85, -0.372, -0.372, -0.547, -0.023, -0.547, 0.675, -0.023, 0.85, 0.326, -0.198, -1.07, 0.152, 0.326, -0.372, -1.419, -1.245, 0.501, -0.372, -1.07, -0.896, -0.198, -0.023, 1.199, 1.373, -0.896, -0.896, 0.675, 0.326, -1.594, -0.372, -0.721, -0.547, 1.722, -0.547, -1.245, 1.722, 0.501, 0.326, 1.548, 0.675, -1.768, -0.372, -1.245, -1.245, -0.721, -1.245, 3.468, 0.675, -0.023, 0.152, -1.768, -0.198, -0.198, -1.594, 0.675, 0.501, -1.245, 0.85, 1.548, -0.023, 1.897, 0.675, -1.419, 0.85, -1.419, 1.199, 1.199, -0.372, -1.594, 0.618, 0.095, 0.409, -1.841, 0.042, -1.789, -0.586, -0.167, 0.775, 0.566, -0.167, 0.775, 0.461, 0.147, 1.351, -1.056, 1.508, 0.827, 0.775, 0.513, -1.475, 0.461, -1.789, -0.115, 0.775, -1.894, 0.88, 0.88, 1.194, 0.461, -0.69, 0.356, 0.461, 0.513, -1.632, 0.618, 0.566, 0.566, -0.586, 0.513, -1.004, 0.461, -1.737, 1.141, 1.194, 0.984, 0.042, 0.409, -1.946, -1.318, 1.455, 0.618, -0.01, -1.841, -0.481, -1.841, 1.351, -1.423, -1.946, 0.723, 0.461, 0.618, 0.513, 0.775, 0.304, 0.147, 0.618, -1.004, 0.827, 0.775, -0.219, 0.618, 0.67, 2.031, 0.042, -2.051, 0.409, 1.351, 0.618, -0.376, -0.324, -1.109, 1.037, -1.475, -1.998, 0.513, -0.272, 0.252, -0.952, 0.095)
fifaRaw <- c(fifaRaw, -0.062, 0.199, 0.775, -0.01, 0.984, 0.409, -0.899, 1.194, 0.67, -0.324, -0.743, -0.115, 0.827, -1.318, 1.089, -1.423, 0.252, 0.566, -0.847, 1.665, -1.894, -1.318, 1.037, -1.998, -0.952, 1.298, 0.67, 0.147, 0.042, -0.272, 0.88, 1.246, -1.946, 1.141, -1.946, -1.318, -0.272, 1.298, 0.042, -1.58, 0.566, -1.109, 0.356, 0.513, 0.095, 0.147, 0.566, -1.527, -0.062, -1.266, 0.67, -0.167, 1.56, 0.67, 0.304, 0.984, 1.194, -1.946, 1.141, 0.199, -0.272, 0.932, 0.409, -0.167, 1.037, 0.827, 0.67, 0.984, -0.01, -1.37, 0.67, 0.566, 0.723, 0.67, -2.051, 0.67, -0.899, -1.841, -1.737, -1.946, 0.566, 0.618, 0.409, -1.737, 0.252, 0.409, 0.095, 0.566, -0.429, 0.252, 0.461, -2.051, 0.252, 0.67, 0.775, -0.586, 0.566, 0.199, -0.01, 0.304, -0.376, -0.167, -1.894, -1.841, -1.056, 1.037, 0.199, 0.513, 0.618, -2.051, -1.894, -0.115, 0.932, 0.984, -0.272, 0.042, -0.062, 1.351, 1.351, 1.612, 0.827, -0.795, 1.089, 0.723, 1.298, 0.566, 1.141, -1.056, 0.304, 0.095, 0.88, -0.69, 0.356, 0.775, 1.403, -0.743, 0.566, 0.67, 0.042, -1.894, 0.513, 0.618, -0.219, 0.461, -1.841, 0.775, 0.827, -2.051, 0.147, 0.409, -1.789, 0.252, 0.827, 0.199, 0.409, 0.356, -0.952, -1.109, -0.586, 0.618, 0.984, -0.115, -0.69, 0.513, -0.899, 0.461, -0.638, 0.932, -1.946, -0.167, 0.304, -0.899, -1.998, 0.461, -0.219, 0.618, 0.304, 1.141, 0.618, 0.984, -1.056, -0.115, 0.409, -2.051, 0.409, 0.304, -0.533, -0.219, -0.376, -1.998, 0.356, 0.513, -2.155, -0.533, 0.042, 1.194, 1.246, 0.932, 0.827, -1.789, 0.775, 0.618, 1.455, -0.219, -1.841, -1.998, 0.199, 0.513, 0.095, 0.461, -1.632, 1.508, 0.461, -1.998, 1.141, -1.37, 1.298, 1.978, 0.723, 0.984, 0.723, 0.042, -0.795, 1.508, -0.324, 0.409, -1.004, 0.984, 1.194, -0.481, 0.775, -0.481, -1.998, 0.932, 0.356, 1.455, -0.533, 0.095, -0.324, -1.423, -0.533, -1.737, 0.461, 0.147, -1.998, 0.775, 0.566, -1.109, -0.01, -1.946, 0.042, -0.952, 0.618, 1.351, -0.69, 0.304, -1.056, -0.167, 1.403, 0.513, 1.298, -0.638, 0.304, 1.298, -1.737, -0.69, 1.037, -1.161, -1.789, -0.899, 
1.403, -0.272, 0.252, 0.356, 0.88, -0.272, 0.827, 0.147, 0.461, -0.795, 0.932, -1.841, -0.69, -1.946, 0.618, 0.199, 0.775, 0.67, 0.775, -0.481, -0.01, -1.004, 0.409, -0.272, 0.199, -0.062, -1.841, 1.717, -1.266, 0.566, 0.304, 0.042, 0.095, -2.208, 0.566, 0.199, 0.827, 0.618, 0.513, -0.272, -1.58, 0.67, 0.304, -0.115, 1.298, 0.618, 0.461, 1.298, -1.737, 0.513, 1.141, 0.67, 0.88, 0.356, -0.69, 0.147, -1.004, 0.827, 0.461, 1.926, 0.461, 0.67, 1.194, -0.69, -1.841, 0.775, 0.775, 0.88, 1.141, 0.042, -1.894, -0.743, -1.998, -0.376, 0.67, -0.743, 0.827, -1.004, 1.194, -0.01, 1.141, 1.037, 0.304, -0.167, -0.847, 0.513, 0.461, 1.194, -1.894, 0.304, 0.042, 0.984, -0.219, -1.946, -1.632, -1.266, 0.775, 0.147, 0.618, -0.638, -1.161, 0.88, 0.461)
fifaRaw <- c(fifaRaw, 0.147, 0.095, -0.272, 0.88, 0.566, -1.998, -0.429, 1.351, 0.304, 0.723, -1.213, -1.475, -1.789, -1.737, 0.88, 0.775, -1.109, 0.199, 0.513, 0.461, 0.618, -0.115, 0.88, 1.351, 0.199, -0.638, -0.69, 0.252, 0.409, 0.618, -2.051, -0.01, -0.481, 0.513, 1.612, 0.67, -1.318, 1.232, 0.475, 1.081, -1.242, -0.636, -1.646, -0.283, 0.576, 0.475, 0.02, -0.485, 1.182, 0.525, 0.273, 1.182, 0.879, 1.131, 1.485, 0.879, -0.182, -1.444, -0.03, -1.646, -1.091, -1.04, -1.444, 0.778, -0.485, 0.273, 0.323, -1.444, -0.586, -0.586, -0.535, -1.697, 1.081, 0.576, -1.242, -0.737, 0.424, -0.434, -0.283, -1.798, 1.03, 0.02, 1.434, 1.333, 0.374, -1.697, -0.081, 1.384, 0.424, 0.576, -1.646, 0.677, -1.495, -0.182, -0.788, -2, 1.131, 0.828, -0.182, 1.03, -0.384, 0.626, 0.828, 0.172, -1.192, 0.172, 1.182, 0.828, 1.03, -0.03, 1.586, 0.02, -1.646, 1.03, 0.576, -0.788, 1.535, -0.737, -1.04, -0.081, 0.929, -1.697, 1.131, -1.444, -1.242, -0.99, -0.081, -0.737, 0.879, -0.485, 0.576, 1.182, 1.535, -0.889, 1.182, -0.737, 1.081, 1.081, 0.778, -0.384, -1.343, -0.737, -1.04, -0.687, 0.626, -0.232, 0.626, -1.848, -0.434, 0.071, -1.848, -1.394, 0.727, 0.475, 0.525, 1.131, 1.283, 0.02, 0.929, -1.293, 0.929, -1.646, -1.04, -0.485, 0.879, -0.434, -1.293, 0.677, -0.939, -0.384, 0.576, 1.182, 0.071, 0.677, -1.697, -0.788, -1.394, 0.98, -0.081, 1.333, 1.081, 0.626, 0.02, 0.778, -1.697, 1.03, 0.374, 0.929)
fifaRaw <- c(fifaRaw, 0.929, -0.838, -0.636, 0.475, 0.98, 0.879, 0.424, 0.778, -1.394, 0.525, 1.434, -0.182, -1.04, -1.444, 0.929, 1.232, -1.495, -1.495, -1.646, 0.424, 1.232, -0.636, -1.747, -0.586, 0.576, 1.182, 0.677, 0.323, -0.131, 0.172, -1.899, 0.929, -0.485, 0.626, 1.232, 0.626, -0.535, -0.384, -0.232, -1.596, 0.98, -2, -1.343, -1.04, 0.475, 1.182, -0.737, 0.727, -2, -1.697, 0.071, 0.626, -0.687, 1.131, -0.485, 0.98, 0.727, 1.283, 1.232, 1.081, -0.485, 0.424, 0.273, 1.687, 0.121, 0.273, -1.192, -0.737, -1.192, 0.677, -0.788, 0.172, -0.283, 0.475, -1.343, 0.071, 0.98, -0.131, -1.798, -0.485, -0.03, 0.929, -0.03, -1.242, 0.677, 1.283, -1.848, -1.091, 0.576, -1.545, 0.374, 1.586, 0.626, 0.424, 0.778, -0.838, 0.727, -0.99, 0.475, 1.081, 0.374, -0.838, 0.323, -0.99, 0.525, -1.091, 0.879, -1.899, 0.374, 0.172, -1.141, -1.848, -0.636, -0.687, -0.636, 1.434, -0.384, 0.02, 0.727, 0.727, -0.384, 1.636, -1.747, 0.677, -0.636, 0.525, 0.677, 1.384, -1.596, -0.384, 1.131, -1.747, 1.434, 1.838, 1.182, -0.636, 0.576, 1.788, -1.848, 0.475, 0.525, 0.778, 0.677, -1.545, -1.899, 0.626, 0.323, 1.03, 0.778, -1.596, 1.788, -0.384, -1.899, 0.02, -1.242, 1.081, 1.687, -1.293, -0.586, 1.232, -0.636, -0.889, 1.081, 0.475, 0.626, 0.576, 0.172, 0.778, 0.677, -0.939, 1.131, -1.545, 0.525, -0.889, 1.485, -0.636, -0.131, 1.182, -1.141, 1.182, -1.394, 0.879, 1.03, -1.747, 0.02, -1.192, -1.192, 0.475, -1.899, -0.03, -0.535, 1.03, 0.98, -0.384, 1.384, -1.394, -0.737, 0.374, 0.071, 0.828, 0.172, 0.222, 0.98, -1.394, -0.182, -0.081, -0.687, -1.646, 0.576, 0.727, 0.121, -0.485, -0.687, 0.98, -0.03, 1.333, -0.636, 0.424, 1.131, 0.879, -1.646, 1.081, -1.697, 0.626, -1.141, 0.525, -0.434, 1.333, 0.323, 1.03, -0.939, -0.232, 0.677, 0.323, -0.636, -1.444, 1.131, -0.434, 1.586, -0.838, -1.141, -0.485, -1.242, 0.02, -0.03, 0.374, -0.838, 0.98, -1.141, -1.394, 1.485, -0.232, 0.222, 0.121, 0.424, 0.677, -0.081, -1.848, -0.99, 0.273, 0.525, 0.727, 0.828, -0.131, -0.939, 0.778, 0.727, 1.182, 0.727, 0.626, 
0.677, 0.879, 1.232, -1.596, -0.131, 1.232, 0.626, 1.384, 0.172, -1.848, -0.03, -1.848, 0.222, 0.778, 1.182, 0.828, -1.04, 0.525, -0.283, -0.485, 0.071, -0.737, 0.727, -1.242, -0.283, 0.576, 1.081, -1.747, 0.071, 1.182, 0.071, 1.333, -1.495, -1.242, 0.98, -1.04, -0.283, 1.434, -0.737, 0.626, 0.879, 0.172, 1.434, -0.636, -0.434, 1.182, 0.475, -1.495, 1.03, 0.576, 0.222, 0.475, -0.788, -1.242, -1.242, -1.545, 0.778, 1.081, -1.242, 1.384, -0.939, 0.626, 0.828, -0.434, 1.687, 1.384, 0.222, -0.283, -1.747, -0.182, 1.535, 0.778, -2.05, 0.677, -0.636, 0.273, 1.535, 0.071, -1.394, 0.467, 0.189, 0.801, -1.867, -0.033, -2.2, 0.69, -0.366, 0.301, -0.366, -0.644, 0.189, -0.144, -0.255, -0.533, 0.301, 0.467, 0.245, 0.856, -0.7, -1.867, -0.311, -1.756, -0.589, -0.033, -2.089, 0.023, 0.467, 0.634, -0.2, 0.078, -0.311, 0.356, -0.311, -1.811, 0.578, -1.033)
fifaRaw <- c(fifaRaw, 0.578, 1.19, 0.356, 1.023, 0.078, -2.033, -0.811, 0.189, -0.589, 1.301, 0.189, -2.256, -1.033, -0.144, -0.533, -0.644, -2.145, 1.301, -1.589, 0.301, 0.69, -1.2, 0.245, -0.644, 1.19, 1.023, 0.134, -0.144, -0.088, -0.866, 0.301, -0.255, 0.467, 0.467, 0.634, -0.644, 0.134, 0.578, -2.089, -0.255, -0.477, 0.856, 1.523, 1.412, 0.634, -0.366, 0.69, -2.2, 0.301, -0.2, -0.033, 0.801, 1.245, 1.19, 0.801, -0.033, -0.533, -0.033, 1.856, 0.356, -0.644, 0.412, 0.467, 0.801, -0.477, 0.578, -0.033, 0.134, 0.523, 0.023, -0.533, 0.856, 1.023, -2.2, 0.967, 0.023, -2.2, 0.301, 0.189, 0.912, -0.255, 0.245, 0.467, 0.523, 0.023, -1.756, -0.088, -2.256, -0.477, 1.301, 0.412, 1.134, -2.145, -0.255, 0.189, 0.301, -1.644, 1.19, 0.523, -0.033, -2.2, 0.078, 1.078, 0.412, 0.523, -0.422, 0.856, -1.089, 0.912, 0.356, -2.145, 1.245, -0.589, 0.745, 0.467, 0.745, 1.078, 0.245, 0.578, -0.2, 1.245, 0.634, 0.801, 0.856, 0.301, 0.69, 0.801, -1.756, -0.033, 1.412, -2.089, -2.089, -2.145, 0.023, -0.255, -0.255, -2.256, -0.755, -0.2, 0.245, 0.134, -0.255, 0.523, -0.644, -2.256, -0.7, 0.912, -0.311, 0.523, 0.356, 0.745, -0.366, 0.134, 1.023, 0.967, -2.145, -2.033, 0.301, -0.589, 0.801, 0.078)
fifaRaw <- c(fifaRaw, -0.922, -2.422, -2.033, -0.422, -0.589, -0.533, 1.356, -0.422, 0.634, 0.412, 0.856, -0.255, 0.301, 1.134, 0.856, 0.301, 0.912, -0.311, 0.023, 1.19, -0.088, -0.422, 0.578, 0.023, -0.866, 0.467, 1.412, 0.745, -0.978, 0.356, -0.755, -2.311, 0.356, 1.134, 0.69, 0.967, -1.756, -0.533, -0.811, -2.089, 1.245, 0.301, -1.867, -0.7, 0.745, 0.578, 1.023, -0.755, 1.134, -0.144, -0.477, 0.245, -0.422, 0.412, 0.856, 0.356, 0.856, 0.356, 1.356, 0.245, -1.756, -0.311, -0.811, 0.467, -2.2, -0.422, 0.412, 0.189, 0.356, 0.412, 0.356, 0.078, 0.69, 1.634, 1.69, -1.922, 0.523, -0.033, -0.644, 0.134, 1.19, -2.089, -0.088, 0.967, -2.2, 0.801, 1.301, 1.023, 0.69, 0.245, 1.523, -2.256, 0.412, 0.189, 0.245, 0.467, -1.922, -2.2, -0.033, 0.912, 0.245, -0.422, -1.922, 1.412, -0.477, -2.089, 0.301, 0.801, -0.589, 0.412, 0.023, 0.578, 0.245, 0.745, 0.189, 0.356, 0.967, -0.755, -0.033, 1.023, -0.644, -0.033, -0.144, 0.634, -2.256, 0.189, 1.579, -0.033, 0.078, 1.19, 1.023, 0.078, 0.856, -1.978, -0.589, -0.144, -2.2, 0.578, 0.356, 0.967, -0.033, -2.2, 1.134, 1.134, 0.912, 0.134, 0.523, -0.311, 0.189, 0.578, 0.412, -0.144, 1.245, -0.255, 0.078, -0.422, -2.256, -0.811, -0.144, 0.912, -2.256, 0.356, 0.634, 1.245, -0.144, 1.023, 0.023, 0.023, 0.801, 1.579, 0.634, 0.912, 0.412, -1.756, 0.801, -1.867, -0.922, -0.533, 0.245, 0.856, -0.255, -0.644, -0.2, 0.023, 0.801, -0.422, 0.745, -0.033, -2.256, 1.023, 1.467, 0.745, -0.033, 1.023, 0.412, -1.2, 1.023, 1.245, 0.69, 0.69, 0.301, -0.589, -2.033, 0.134, 0.467, 0.856, -0.033, 0.634, 0.189, 0.189, -1.811, 0.301, 0.356, 0.412, 0.356, 0.356, 0.912, 0.801, 0.801, 0.412, 0.245, 0.69, 0.801, 0.301, -0.866, 1.023, -1.922, 0.189, 0.745, 0.523, 0.356, -0.311, -2.256, 0.912, -2.311, -0.477, 0.301, 0.745, 0.967, 1.078, 0.801, 1.301, 0.634, -0.2, 0.634, 0.801, 0.578, 0.189, -0.533, 0.245, -2.2, 0.578, 1.412, 0.023, 1.301, -1.978, -2.2, 0.523, -0.2, 0.467, 0.856, -0.589, -0.422, 0.023, 0.412, 1.19, 0.912, -0.422, 1.634, -0.7, -2.2, 0.578, 0.467, 
-0.755, 0.023, 0.634, -1.7, -1.756, -1.978, 2.19, -0.366, 0.912, 1.301, 0.467, -0.033, -0.7, -0.811, 1.467, 1.412, -0.2, 1.134, 0.578, -0.811, 1.078, 0.134, -2.534, 1.023, -0.477, -0.311, -0.144, 0.467, -0.533, 0.061, 0.26, 0.458, -2.386, 0.26, -2.121, -0.137, -0.203, 0.193, 1.318, 0.26, 0.723, 0.723, 0.392, 1.252, -1.063, 1.119, 0.921)
fifaRaw <- c(fifaRaw, 0.855, -1.063, -1.923, 0.656, -0.865, -1.261, 0.127, -2.055, 0.326, 0.524, 0.656, 1.252, -0.931, -1.658, 0.061, 0.656, -1.592, 0.193, -0.005, 0.127, -0.005, 1.053, 0.326, 0.789, -0.732, 0.458, 0.656, 0.656, 0.127, 0.855, -1.923, 0.193, 1.45, 0.392, 0.458, -1.658, 0.59, -1.923, 0.855, -0.071, -1.46, 0.524, 0.524, 0.789, 0.392, 0.723, 0.127, 0.855, -0.005, -1.394, 0.656, 0.458, -0.269, -0.203, 0.26, 2.243, 0.59, -1.724, 0.921, -0.203, 0.723, 0.59, 0.392, -0.666, 0.127, -0.005, -2.187, 0.127, -1.658, -0.666, -1.526, 0.458, 0.59, 0.26, 0.127, 0.921, 0.524, 0.789, -1.195, 0.921, -0.6, -0.402, -0.203, 0.789, 0.458, -1.261, 0.061, -0.402, -1.989, 0.392, 0.193, 1.516, -2.452, -0.931, 1.119, -2.253, -0.6, 0.723, 0.656, -0.534, -0.137, -0.005, 0.458, 0.789, -0.137, 0.855, -2.319, -1.658, -0.005, 1.384, 0.789, -1.46, -0.005, -0.6, -0.6, -0.137, 0.458, 0.193, 0.127, -2.386, 0.26, -0.203, 0.987, 0.59, 1.45, 0.987, 0.193, 0.326, 1.053, -1.327, 0.921, 0.458, -0.203, 0.193, 0.127, 0.392, 1.185, 0.26, 0.127, 1.053, 0.59, -0.203, 0.723, 0.326, 0.193, 0.193, -1.857, 0.59, -0.6, -1.724, -2.055, -1.195, 0.656, 0.326, -1.658, -1.526, -0.137, 0.458, 0.326, 0.524, 0.392, 0.326, 0.127, -2.452, 0.326, 0.392, 0.392, 0.127, 0.921, 0.061, 0.127, 0.392, -0.6, 0.789, -2.187, -2.716, -1.526, 0.987, -0.071, 0.127, 1.252, -0.931, -0.402, 0.127, 0.458, 0.392, 0.524, 0.061, 0.193, 1.252, 0.656, 1.252, 0.458, 0.061, 0.458, 0.127, 1.119, 0.26, -0.005, -0.071, 0.061, -0.005, 1.053, -0.6, 0.392, 0.656, 1.252, 0.127, 0.921, 0.193, 0.26, -2.386, 0.127, 0.392, -0.005, 1.318, -2.187, 0.458, 0.921, -2.253, 0.656, 0.193)
fifaRaw <- c(fifaRaw, -2.055, 0.524, 1.318, 0.326, 0.656, 0.789, -0.005, -0.732, -1.394, 0.061, 0.392, 0.656, 0.26, 0.855, -0.269, 0.326, -0.269, 0.921, -2.319, 0.326, -0.071, -0.402, -2.187, -0.336, 0.921, 0.326, 0.723, 0.59, 1.45, 0.326, -1.394, 0.656, 0.723, -1.526, 0.524, -0.666, -0.203, 0.458, 1.053, -1.658, -0.402, 0.656, -1.857, 0.392, 0.921, 0.789, 0.855, 0.789, 1.252, -2.319, 1.119, 1.053, 1.185, -1.129, -1.195, -2.518, -0.666, 1.185, 1.252, 0.326, -2.452, 1.516, -0.071, -2.452, 0.656, -0.6, 0.921, 1.714, 0.458, -0.137, 0.458, -0.336, 0.524, 1.318, 0.656, -0.137, -0.997, 0.59, 1.053, -0.402, -1.195, 0.326, -0.666, -0.005, 0.326, 1.185, -0.137, 0.855, -0.203, -1.658, 0.326, -1.989, -0.666, -0.336, -2.386, 0.127, -0.269, 0.524, 0.59, -2.187, 0.392, 0.524, 0.524, 0.987, 0.061, 0.458, -1.658, 0.855, 0.855, 0.458, 0.987, -0.137, 0.723, 1.252, -0.865, 0.061, 0.326, 0.127, -1.526, -0.997, 0.921, 0.656, 0.26, -0.137, 0.789, 0.26, 1.119, 1.185, 0.656, -1.195, 0.789, -1.46, -0.402, -2.782, 0.855, -0.534, 0.26, 1.053, 0.524, -0.732, 0.193, -0.269, 0.392, 0.326, 0.193, -0.005, -2.055, 0.987, -0.865, 0.326, -0.798, 0.855, -0.137, -1.989, 0.458, -0.203, 0.26, 0.789, 1.252, -0.997, -2.386, 0.921, 0.326, 0.326, 0.458, 0.723, 0.458, 1.384, -2.253, -0.005, 0.723, 0.061, 1.582, 0.326, 0.127, 0.723, -0.666, 0.524, 0.524, 1.318, 0.656, 0.193, 0.26, 0.458, -2.848, 0.127, 0.921, 0.326, 1.318, 0.392, -1.989, 0.789, -2.253, -0.865, 1.185, -0.732, 1.318, -0.137, 0.855, 0.458, 0.193, 0.061, 0.392, 1.252, -0.402, 0.061, 0.193, 0.987, -2.716, 0.326, -0.203, 0.59, 0.59, -1.592, -1.79, -1.394, -0.402, 0.524, 0.392, -0.269, -0.865, 0.458, 0.59, -0.402, 0.26, 0.127, 0.987, 0.524, -1.923, 0.127, 0.723, -0.137, 0.458, -0.666, -0.468, -2.716, -2.716, 1.318, 0.987, -0.071, 0.59, -0.071, 0.789, 0.061, 0.326, 0.723, 0.987, 0.26, -0.137, -1.195, -0.666, 0.723, 0.26, -2.584, 0.987, 0.061, 1.053, 1.384, -0.005, -1.46, 1.011, 0.622, 0.733, -1.488, -0.044, -1.599, -0.877, 0.456, -0.266, 0.345, 
-0.044, 0.345, 1.289, 0.456, 1.177, 0.011, 1.233, 1.622, 1.177, -0.933, -1.544, 0.289, -1.599, -0.711, -0.6, -1.544, 0.955, 1.066, 0.9, 0.511, -0.766, -0.6, -0.377, -0.211, -1.321, 1.233, 0.511, -1.544, -1.099, 0.789, -0.655, 0.233, -1.821, 0.011, 0.178, 1.066, 1.455, 0.289, -1.655, 0.011, 1.122, -0.322, 0.233, -1.655, 0.178, -1.266, 0.289, -1.099, -2.043, 0.122, 0.955, -0.488, 0.622, -0.044, 0.178, -0.6, 0.233, -0.877, 0.567, 1.344, 1.122, 0.844, 0.567, 2.233, 0.4, -1.377, 0.789, 0.955, 0.122, 1.4, -0.655, -1.21, 0.011, 0.233, -1.544, 1.233, -0.711, -0.766, -0.766, 0.233, 0.4, 0.067, -0.933, 0.9, 0.955, 2.288, -0.6, 1.066, -0.711, 1.455, 0.567, 0.067, -0.266, -0.877, -0.766, -0.655, -0.877, 0.122, -0.155, 1.622, -1.821, -1.155, -0.1, -1.821, -0.933, 0.955, 1.122, 0.733, 1.122, 0.733, -0.488, 0.456, -1.21, 0.567, -1.599, -1.21, -0.433, 0.955, 0.4, -1.21, 0.178, -0.711, -0.711, 0.345, 0.4, 0.233, 0.345, -1.655, -0.322, -0.766, 0.789, -0.155, 1.455, 0.345, 0.289, -0.988, 0.456, -1.655, 1.4, 0.678, 0.456, 1.344, -0.1, -1.599, 0.456, 0.511, 0.844, 1.011, -0.155, -0.711, 0.511, 1.177, -0.377, -0.6, -2.099, 0.511, 0.678, -1.266, -1.488, -1.599, 0.178, 1.066, -0.822, -1.766)
fifaRaw <- c(fifaRaw, -0.544, 0.844, 1.289, 1.011, 0.178, -0.1, 0.289, -1.821, 0.733, -1.044, 1.233, -0.044, 1.289, -0.655, -0.6, 0.011, -1.044, 1.289, -1.932, -1.488, -0.544, 0.233, 0.733, -0.766, -0.044, -1.877, -1.766, -0.044, 0.622, -1.044, 0.733, -0.655, 1.566, 0.122, 1.622, 1.4, 1.011, -0.488, -0.266, 0.289, 2.01, 0.678, 0.789, -0.377, -0.711, -0.544, 0.844, -0.488, -0.211, -0.711, 1.622, -1.044, -0.044, 0.844, 0.178, -1.821, -0.6, -0.711, 0.733, 0.678, -1.655, 1.066, 1.566, -1.988, -0.766, 0.456, -1.599, 0.122, 1.344, 0.622, 1.844, -0.155, -0.877, -0.322, -0.6, 1.122, 0.067, -0.433, -0.433, 1.177, -1.266, 0.289, -0.877, 1.455, -1.988, -0.155, -0.211, -0.711, -1.988, -0.211, -0.044, -0.6, 0.678, -0.044, 0.4, 0.178, 0.622, -0.655, 1.844, -1.766, 0.4)
fifaRaw <- c(fifaRaw, -0.988, 0.067, 0.844, 1.733, -1.544, -0.377, 1.177, -1.599, 0.955, 1.677, 1.788, -0.711, 1.233, 2.344, -1.766, 0.067, 0.955, 0.844, 0.233, -1.488, -1.988, 1.122, -0.544, -0.6, 0.4, -1.377, 1.899, -0.377, -1.821, 0.511, -0.711, 1.677, 1.955, -0.988, -0.433, 1.289, 0.011, -0.377, 1.788, 0.011, -0.266, -0.433, 0.289, 1.455, 0.067, -0.822, 0.678, -1.544, -0.544, -1.377, 2.233, -0.322, -0.266, 1.455, -0.766, 0.678, -1.488, 0.511, 0.9, -1.821, -1.599, -0.933, -0.488, 0.289, -1.932, -0.211, -0.488, 0.955, 0.733, -0.433, 1.177, -0.822, -0.822, -0.1, 0.122, 1.122, -0.711, 0.345, 1.289, -1.488, -0.155, -0.044, -0.766, -1.21, -0.211, 0.844, 0.067, -0.711, -0.766, 0.733, -0.488, 1.566, -0.377, 0.511, 1.566, 0.844, -1.377, 0.345, -1.71, 0.789, -1.21, 0.678, -0.1, 0.955, -0.266, 0.789, -1.099, -0.711, -0.155, 0.622, -0.766, -1.544, 1.899, -0.377, 1.233, -0.488, -0.377, -0.655, -1.21, -0.377, 0.345, 0.955, 0.622, 0.289, -0.988, -1.321, 1.788, -0.1, 0.456, 0.9, 1.4, 0.678, 0.511, -1.71, -0.766, 0.289, -1.044, 0.178, 0.9, -0.155, -0.044, -0.044, 0.511, 1.289, 1.4, 1.066, 0.622, 1.011, 1.289, -1.544, -0.433, 1.4, 0.233, 0.955, -0.155, -1.988, -0.544, -1.544, 0.678, 1.511, 0.511, 1.011, -0.877, 0.067, -0.433, 0.622, -0.266, -0.655, 0.955, -1.155, 0.4, 0.289, 1.011, -1.655, 0.122, 1.011, 0.233, 1.289, -1.544, -1.432, 0.289, 0.4, -0.155, 1.455, -0.6, -0.044, 0.9, 0.233, 1.566, -0.711, -0.488, 1.4, -0.044, -1.599, 0.289, 0.9, -0.322, 0.011, -0.655, -1.21, -1.377, -1.599, 1.344, 1.011, -0.766, 1.122, -1.155, 0.4, -0.211, -0.155, 1.788, 1.066, -0.044, -0.1, -0.655, -0.655, 1.733, 0.345, -1.766, 0.678, -0.544, -0.1, 1.677, 0.067, -0.766, 0.824, 0.302, 0.824, -1.837, 0.093, -2.15, -1.107, 0.458, 0.615, 0.719, -0.22, 0.928, 0.406, 0.615, 1.293, -0.585, 0.771, 1.397, 0.771, -0.324, -2.046, 0.458, -1.89, -0.272, -0.063, -2.255, 0.615, 0.406, 0.51, 0.824, -1.055)
fifaRaw <- c(fifaRaw, 0.197, 0.302, 0.615, -1.837, 0.25, 0.406, 0.354, -0.168, 0.876, 0.145, 0.563, -2.255, 0.458, 0.928, 1.189, 0.667, 0.615, -2.203, 0.145, 1.137, 0.563, 0.563, -2.15, 0.093, -1.785, 0.98, -1.159, -2.307, 0.667, 0.302, 0.563, 0.719, 0.928, 0.145, 0.876, 0.458, -1.107, 0.667, 0.667, 0.197, 0.406, 0.667, 1.397, 0.458, -2.203, 0.719, 0.406, -0.481, 0.667, -0.376, 0.197, 0.51, 0.093, -2.203, 0.667, -1.42, -0.063, -1.211, -0.585, 0.25, -0.116, 0.25, 0.51, 0.667, 0.406, -1.003, 1.084, 0.719, 0.667, 0.145, 0.771, 0.51, -1.368, 0.876, -1.42, 0.145, 0.771, -0.533, 0.51, -2.359, -0.846, 1.084, -2.411, -0.846, 0.667, 0.771, 0.406, 0.458, 0.771, 0.145, 0.824, -1.942, 0.719, -2.255, -1.472, -1.211, 1.189, 0.093, -1.89, 0.667, -0.585, -0.324, 0.563, 0.145, -0.116, 0.51, -2.359, -1.159, -0.168, 0.719, 0.302, 1.867, 0.876, 0.406, 0.771, 0.041, -2.046, 0.98, 0.563, 0.093, 0.615, 0.041, -0.22, 0.615, 0.824, 0.406, 0.667, 0.719, -0.063, 0.51, 0.51, 0.041, 0.406, -1.577, 1.084, 0.302, -1.785, -2.046, -2.098, 0.458, 1.084, 0.25, -2.046, 0.406, 0.406, 1.032, 0.667, 0.354, 0.093, 0.51, -2.359, 0.667, 0.406, 0.615, 0.354, 0.197, -0.846, 0.041, 0.145, -0.742, 0.458, -2.359, -1.942, -1.316, 0.615, 0.719, 0.563, 0.51, -1.994, -2.255, 0.145, 0.563, 0.458, 0.51, 0.25, 0.719, 0.197, 0.98, 1.397, 1.032, -0.116, 0.876, 0.041, 1.293, 0.563, 0.563, -1.003, 0.145, -0.324, 0.667, -0.324, 0.51, -0.168, 0.876, -0.481, 0.615, 0.458, 0.667, -2.255, -0.168, 0.041, 0.51, 1.032, -2.098, 0.667, 0.876, -2.516, 0.354, 0.667, -2.203, 0.145, 0.928, 0.302, 0.51, 0.458, -0.481, -0.324, 0.145, 0.615, 1.137, 0.458, -0.585, 0.51, -1.159, 0.458, -0.898, 0.928, -2.307, 0.51, 0.197, -1.159, -2.203, 0.197, -0.116, 0.25, 0.458, 1.032, 0.876, 0.197, 0.145, -0.376, 0.563, -2.307, 0.354, -0.585, 0.51, 0.824, 1.241, -1.89, -0.168, 0.719, -2.203, 0.771, 1.137, 0.667, 0.98, 0.771, 0.928, -2.463, 0.302, 0.25, 1.189, -0.063, -2.046, -2.307, -0.168, 0.563, 0.719, 0.615, -1.837, 1.45, 0.093, -2.255, 0.615, -0.481, 
0.98, 1.658, 0.145, -0.272, 0.98, -0.116, -1.003, 1.397, 0.615, 0.667, -0.481, 0.667, 0.615, 0.145, 0.041, 1.241, -1.89, 0.719, -0.324, 1.345, 0.041, -0.063, 0.563, -1.524, 0.041, -2.046, 0.615, 0.615, -2.307, 0.406, 0.406, -0.742, 0.615, -2.307, -0.22, -0.637, 0.615)
fifaRaw <- c(fifaRaw, 1.032, -0.116, 0.667, -1.263, -0.22, 0.98, 0.51, 0.876, -0.011, 0.51, 1.084, -1.942, -0.168, 0.197, -1.159, -2.203, -0.637, 1.293, -0.168, -0.063, 0.25, 0.719, -0.063, 1.137, 0.406, 0.458, 0.406, 0.824, -2.046, 0.041, -2.098, 1.345, -0.272, 0.145, 0.406, 0.928, -0.063, 0.563, -1.055, 0.302, 0.406, 0.145, -0.011, -2.203, 0.824, -0.481, 0.824, -0.168, 0.25, -0.011, -1.263, 0.25, -0.063, 0.406, 0.458, 0.615, -0.637, -1.733, 1.189, 0.145, -0.324, 0.615, 0.615, 0.563, 1.137, -2.098, 0.145, 0.667, 0.302, 0.824, 0.824, -0.116, 0.771, -0.22, 0.51, 0.876, 1.137, 0.51, 0.667, 0.354, 0.302, -2.046, 0.458, 1.032, 0.771, 0.824, 0.093, -2.15, 0.041, -2.255, 0.51, 0.563, 0.51, 0.824, -0.846, 0.667, -0.429, 0.406, -0.116, -0.011, 0.563, -1.211, 0.302, 0.51, 0.98, -2.307, -0.533, 0.145, 0.719, 0.667, -2.098, -2.046, 0.25, -0.272, -0.063, 0.615, -0.324, -0.22, 0.719, 0.458, 0.98, -0.116, -0.116, 0.719, 0.458, -1.89, 0.041, 1.032, 0.563, 0.458, -0.794, -1.629, -2.203, -2.203, 0.458, 0.928, -0.429, 0.667, -0.324, 0.719, 0.458, 0.354, 0.824, 1.084, 0.563, -0.168, -0.742, 0.093, 1.241, 0.719, -2.15, 0.563, -0.116, 0.51, 1.345, 0.145, -0.898, 0.902, 0.902, 0.635, -1.657, -0.164, -1.87, -0.804, 0.742, 0.582, 0.795, 0.369, -0.271, 0.902, 0.582, 1.275, -0.378, 1.915, 1.595, 1.222, -0.964, -1.55, -0.538, -1.71, -0.751, 0.635, -1.604, 1.381, 0.582, 1.275, 0.902, -1.177, -0.431, -0.271, -0.591, -1.71, 1.008, -0.218, -0.964, -1.337, 1.062, -0.697, 0.209, -1.604, 0.209, 1.275, 1.275, 0.422, 0.529, -1.764, -0.324, 1.861, -0.111, 0.582, -1.657, 0.102, -1.177, 0.529, -1.124, -1.924, 1.168, 0.902, 0.422, 0.209, 0.529, 0.049, -0.431, 0.742, -0.697, 1.222, 0.742, 0.369, 0.369, 0.155, 2.128, 0.689, -1.444, 0.955, 1.328, -0.697, -0.058, -1.017, -1.231, 0.689, -1.337, -1.817, 0.848, -0.911, -0.697, -0.911, -1.124, -0.484, -0.111, 0.689, 1.062, 1.328, 0.635, -1.071, 0.155, -0.431, -0.164, -0.431, 0.635, 0.209, -1.177, -0.378, -1.071, -0.538, 1.062, -0.591, 1.915, -1.817, -1.284, 
1.115, -1.71, -1.284, 1.008, 1.275, 0.582, 0.848, 0.049, 0.742, 1.541, -1.497, 1.168, -1.71, -1.231, -0.218, 1.488, 0.902, -1.39, 0.422, -0.751, -0.644, 0.315, 0.209, -0.058, 0.049, -1.337, -0.644, -1.337, 0.209, 0.102, 1.328, 0.209, 0.315, -0.964, 0.848, -1.87, 0.689, 0.422, 0.742, 0.689, -0.164, -0.484, 0.902, 0.955, 1.062, 0.902, 0.422, -0.697, 0.582, 0.582, 0.209, -1.071, -1.231, 0.848, -0.111, -1.39, -1.71, -1.87, 0.102, 0.369, -0.484, -1.764, -0.111, 0.475, 0.582, 0.795, -0.004, -0.324, 0.315, -1.764, 0.102, -0.271, 1.168, -0.378, 0.955, -0.004, -0.644, 0.582, -0.804, 0.475, -1.817, -1.817, -0.857, 1.115, -0.697, 0.049, 0.102, -1.977, -1.764, 0.262, 1.062, -0.004, 0.742, -0.911, 0.102, 1.861, 1.755, 1.648, 0.262, -0.484, 0.582, -0.111)
fifaRaw <- c(fifaRaw, 1.915, 0.315, 1.115, -1.124, -0.697, -0.164, 1.115, -0.857, 0.102, 1.168, 1.861, -0.857, 0.529, 0.635, 0.582, -1.87, 0.102, -1.284, -0.111, 0.635, -1.604, 0.689, 1.488, -1.924, -0.697, 0.369, -1.497, 0.315, 1.435, 0.635, 0.262, 0.475, -0.591, -0.484, -0.591, 0.635, 1.488, -0.218, -0.857, 0.848, -0.697, -0.058, -1.124, 0.795, -1.87, -0.484, 0.102, -0.804, -1.924, -0.324, -0.538, -0.644, -0.644, 0.475, 0.422, 1.062, -0.378, -0.538, 0.209, -1.817, 0.422, 0.102, 0.049, 1.541, 1.328, -1.657, 0.742, 1.062, -1.39, 0.155, 1.488, 0.475, 0.848, 1.222, 1.435, -1.764, 1.541, 0.155, 1.648, -0.964, -1.817, -1.924, -0.804, 0.315, -0.218, -0.058, -1.55, 2.128, 0.049, -1.817, 1.008, -0.644, 1.488, 2.341, -0.804, -0.964, 0.795, -0.591, -0.484, 1.595, 0.848, 0.102, -0.164, 1.381, 1.381, 0.102, -0.697, 0.155, -1.87, -0.324, -0.431, 1.701, -0.591, 1.381, 0.529, -0.804, 0.209, -1.444, 0.049, 0.475, -1.87, 0.529, -0.644, -0.751, 0.155, -1.657, -0.218, -0.751, 0.635, 1.008, -0.111, 1.168, -1.337, -0.058, 0.422, -0.164, 0.848, -0.697, 0.315, 1.328, -1.604, -0.271, 0.742, -0.857, -1.977, -0.591, 1.008, -0.271, -0.538, -1.017, 0.529, -0.538, 1.168, -1.284, 1.115, -0.431, 0.369, -1.817, -0.271, -1.817, 1.115, -0.911, 0.689, 0.262, 0.742, -0.004, 0.795, -0.857, 0.529, 0.422, 1.381, -0.804, -1.444, 2.074, -1.444, 0.262, -0.218, 0.209, 0.262, -0.804, -0.804, 0.529, 1.008)
fifaRaw <- c(fifaRaw, 0.049, 1.008, -0.538, -1.444, 1.222, -0.164, -0.378, 1.222, 1.328, 0.369, 1.275, -1.817, -0.644, 0.529, 0.529, 1.062, -0.004, -0.538, -0.058, -0.911, 0.529, -0.697, 1.915, 0.529, 0.689, 1.222, 0.902, -1.497, 0.848, 1.062, 0.369, 1.008, 0.635, -1.924, -0.644, -1.657, 0.369, 0.955, -0.697, 0.155, -1.231, 1.115, -0.218, -0.644, 0.422, -0.538, 0.475, -1.071, 0.315, 0.209, 1.595, -1.55, -0.484, 0.369, 1.222, 0.689, -1.604, 0.475, -0.697, 0.848, -0.004, 1.168, -0.697, -0.857, 1.275, 0.475, 0.955, -0.591, -0.644, 0.049, 0.582, -1.444, -0.271, 1.168, 0.635, 0.529, -1.124, -1.231, -0.857, -1.817, 1.488, 1.435, -0.857, 0.369, 0.529, 1.168, 0.955, -0.111, 0.369, 1.755, 0.955, 0.155, -0.751, -0.164, 0.475, 0.422, -1.71, 0.742, -0.484, 0.848, 1.541, 0.155, -1.231, 1.237, 0.747, 0.747, -1.432, -0.015, -1.649, -0.887, 0.148, 0.257, 0.91, -0.015, -0.07, 1.292, -0.124, 1.292, -0.342, 2.381, 0.856, 1.128, -0.941, -1.595, 0.856, -1.432, -0.669, 0.311, -1.704, 1.346, 0.039, 1.455, 1.401, -0.724, -0.397, -0.451, -0.724, -1.432, 0.747, -0.124, -0.941, -1.05, 0.747, -0.941, -0.07, -1.649, -0.615, 1.401, 1.237, -0.179, 1.183, -1.486, -0.669, 1.891, -0.342, 0.856, -1.54, -0.015, -1.159, 0.039, -0.996, -1.595, -0.669, 1.618, 0.529, 0.148, 0.475, 0.257, -0.015, 0.856, -0.669, 0.529, 0.148, -0.179, 0.093, -0.233, 2.272, 0.42, -1.704, 0.91, 1.891, -0.451, 0.747, -0.07, -0.615, 0.202, -1.05, -1.595, 1.019, -0.724, -0.615, -0.887, 0.693, -0.124, -0.397, -0.288, 0.91, 1.618, 0.42, -0.724, 0.148, -0.124, 0.202, -0.778, -0.233, 0.475, -1.159, -1.105, -0.669, -0.669, 0.42, -1.05, 2.163, -1.704, -1.214, 1.019, -1.649, -0.941, -0.506, 1.237, 1.074, 0.747, -0.56, 1.401, 1.51, -1.486, 1.401, -1.323, -0.832, -0.179, 0.856, -0.015, -1.649, -0.179, -0.506, -0.288, 0.039, 0.039, 0.257, -0.342, -1.105, -0.778, -0.941, 0.747, 0.366, 1.455, 1.401, 1.237, -0.506, 0.366, -1.649, 1.128, 0.747, -0.451, 0.638, -0.451, -0.179, 0.747, 0.802, -0.288, 1.292, 1.128, -0.56, 0.693, -0.179, 0.638, 
-0.615, -1.323, 1.237, -0.288, -1.486, -1.595, -1.54, 0.366, 0.693, -0.179, -1.54, -0.615, 1.074, 0.475, 0.91, 0.039, -0.179, 0.965, -1.704, 0.747, 0.093, 1.074, -0.397, 1.237, -0.615, -0.342, 0.366, -0.832, 0.856, -1.758, -1.323, -0.724, 1.237, -0.615, -0.124, -0.179, -1.813, -1.758, -0.179, 1.292, -0.288, 0.311, -0.451, -0.07, 1.673, 1.292, 1.455, 1.128, -1.05, 0.148, 0.202, 1.891, 0.148, -0.669, -0.832, -0.669, -0.07, 1.128)
fifaRaw <- c(fifaRaw, -0.56, 0.91, 1.727, 1.891, -1.377, 1.401, -0.506, 1.183, -1.649, -0.451, -0.941, -0.778, 0.856, -1.432, 0.529, 1.401, -1.758, -1.05, 0.42, -1.323, 0.747, 1.564, 1.074, 0.257, 0.093, -0.451, -0.724, -0.724, 0.148, -0.124, -0.56, -0.778, 0.91, -0.124, 0.148, -0.887, 1.51, -1.649, -0.451, -0.669, -0.342, -1.54, -0.342, -0.124, -0.615, -0.615, 0.42, 0.475, 0.475, -0.397, 0.093, 0.366, -1.758, -0.124, -0.996, -0.015, 1.074, 0.584, -1.704, 0.638, 1.128, -1.649, -0.233, 1.292, 0.638, -0.288, 1.836, 0.747, -1.486, 1.618, 0.747, 1.836, -1.105, -1.595, -1.758, -0.887, -0.07, -0.288, -0.179, -1.268, 2.435, -0.342, -1.704, -0.07, -0.887, 1.727, 2.272, -0.778, -0.233, 0.257, -0.07, -0.778, 1.618, 0.965, -0.179, 0.148, 1.618, 1.727, -0.451, -0.669, -0.506, -1.268, 0.148, 0.856, 1.51, -0.397, 0.475, 0.965, -1.105, -0.179, -1.105, 1.128, 0.093, -1.595, -0.724, -0.669, -1.214, 0.856, -1.758, 1.019, -0.778, 0.91, 0.856, -0.233, 0.584, -0.778, -0.397, -0.506, -0.179, 1.074, -0.506, 0.311, 0.91, -1.214, -0.288, 0.42, -0.615, -1.649, -0.724, 0.91, 0.311, -0.015, -0.451, 1.346, -0.288, 1.51, 0.366, 1.019, -0.669, 0.747, -1.867, -0.342, -1.704, 0.747, -0.724, 1.074, 0.257, 1.128, -0.397, 1.074, -0.778, 0.093, 0.802, 1.945, -0.288, -1.649, 2.109, -1.323, 2, -0.451, -0.724, -0.179, -1.758, -0.832, -0.342, 1.836, -0.56, 1.237, -0.778, -1.54, -0.179, -0.015, -0.56, 1.346, 0.965, 0.802, 1.51, -1.268, -0.778, -0.124, 0.747, 1.346, 0.475, -0.56, -0.288, -0.832, 0.202, -0.778, 2.272, 0.802, 1.564, 1.51, 1.128, -1.595, 0.693, 0.42, -0.124, 1.51, 0.91, -1.758, -0.07, -1.432, -0.015, 0.965, -0.724, 0.91, -1.105, 1.074, 0.257, -0.179, 0.093, -0.669, -0.015, -0.56, 0.093, -0.124, 1.891)
fifaRaw <- c(fifaRaw, -1.432, -0.397, 0.148, 1.564, 0.802, -1.649, 0.91, -0.778, -0.778, 0.093, 1.292, -0.56, -0.724, 1.346, -0.506, -0.887, -0.724, -0.342, -0.124, 0.693, -1.649, -0.288, 0.529, 1.074, 0.475, -0.832, -1.105, -1.214, -1.377, 1.618, 1.455, -0.887, 0.093, -0.996, 0.91, 1.401, -0.179, 0.039, 1.836, 0.802, -0.832, -0.778, -0.56, -0.07, -0.506, -1.758, 0.638, -0.233, 1.019, 1.346, -0.015, -0.832, -0.572, 0.269, 0.01, -2.124, 0.463, -1.801, -0.572, 0.075, -0.054, 1.045, 0.398, 0.528, 0.463, 0.463, 1.239, -1.089, 1.369, 0.786, 1.045, -0.96, -2.06, 0.786, -0.507, -1.025, 0.14, -1.348, 0.01, 0.722, 1.11, 1.239, -0.119, -1.348, -0.507, 0.722, -2.06, -0.119, -0.313, -0.184, 0.334, 1.11, 0.075, 1.304, -1.93, 0.722, 0.075, 0.075, -0.507, 0.851, -1.866, 0.398, 1.821, 0.334, 0.398, -1.995, 0.592, -1.93, 0.851, -0.119, -1.219, -0.119, 0.722, 0.98, 0.14, 0.528, 0.14, 0.592, -0.184, -1.477, 0.592, 0.592, -0.96, -0.895, 0.075, 2.598, 0.528, -1.283, 1.11, 0.204, 0.98, -0.637, 0.269, -0.443, 0.657, -1.283, -1.995, 0.075, -1.736, -0.507, -1.801, 0.722, -0.054, 0.528, 0.075, 1.045, 0.786, 0.657, -1.736, 0.851, 0.204, -0.96, -0.313, 1.304, 0.722, -1.866, 0.14, -0.507, -1.542, 0.592, -0.119, 1.692, -1.413, -0.637, 1.239, -2.124, -0.701, 0.592, 0.786, -0.766, -0.507, -1.219, 0.786, 0.851, -0.184, 1.11, -1.995, -1.801, -0.572, 1.433, 1.304, -1.154, 0.14, -0.119, 0.01, 0.14, -0.119, 0.334, -0.054, -2.642, 0.334, -0.054, 0.851, 0.851, 1.369, 1.304, -0.054, 0.463, 1.304, -1.089, 0.463, 0.463, -0.766, -0.119, 0.204, -0.184, 1.175, 0.398, 0.075, 1.498, 0.657, -0.184, 0.916, -0.249, 0.463, 0.98, -0.184, 0.204, -1.542, -0.766, -1.866, -1.154, 0.592, 0.528, -1.283, -1.283, 0.01, 0.657, -0.443, 0.786, -0.054, 0.528, -0.119, -1.866, -0.119, 0.204, 0.463, -1.219, 1.045, -0.249, 0.398, 0.592, 0.075, 1.045, -1.607, -2.189, -1.154, 1.239, -0.766, -0.313, 1.498, -0.96, 0.463, 0.722, 0.398, 0.722, -0.119, 0.14, 0.269, 1.433, 0.916, 1.433, 0.657, 0.592, -0.313, 0.592, 1.11, 0.204, -0.054, 
-0.119, -0.119, -0.249, 1.045, -0.054, 0.463, 0.463, 0.916, -0.054, 0.592, 0.398, 0.14, -1.93, 0.269, 0.269, -0.766, 1.239, -1.93, 0.463, 1.11, -1.995, 0.722, 0.269, -2.254, 0.786, 1.11, 1.11, 0.916, 0.851, -0.119, -1.348, -1.477, -0.443, -0.766, 1.11, -0.119, 0.786, 0.01, 0.075, -0.054, 0.722, -1.607, 0.398, 0.01, -0.637, -1.801, 0.01, 0.851, 0.14, 0.98, 0.786, 1.239, 0.786, -1.672, 0.334, -0.507, -1.801, 0.528, -0.119, -0.313, 0.722, 0.398, -1.219, -0.378, 0.851, -1.089, -1.089, 0.657, 0.657, 0.398, 0.98, -0.378, -1.672, 1.498)
fifaRaw <- c(fifaRaw, 1.175, 1.369, -1.219, -0.96, -2.254, -0.831, 1.369, 0.98, -0.119, -1.995, 1.692, -0.184, -2.383, 1.11, -1.154, 0.916, 1.757, 0.269, -0.054, -0.637, -0.766, 0.916, 1.11, 0.528, 0.204, -1.348, 0.657, 1.304, -0.507, -1.025, 0.463, -0.443, -0.766, 0.528, 1.045, 0.14, 1.821, -0.831, -1.283, -0.766, -1.801, 0.204, -0.184, -2.383, 0.398, -0.443, -1.154, 0.786, -1.995, 0.463, 0.98, 0.916, 0.851, 0.398, 0.722, -1.477, 1.045, 0.98, 0.398, 1.304, -0.184, 0.851, 1.433, -1.283, 0.204, 0.592, -0.054, -1.477, -1.025, 0.98, 0.657, -0.184, 0.14, 1.304, 0.14, 1.175, 1.045, 0.916, -1.283, 0.98, -1.93, -1.283, -2.512, 0.851, -0.507, 0.528, 0.98, -0.054, -0.701, -0.378, -0.313, 0.334, 0.528, 0.463, 0.075, -1.607, 1.175, -0.507, 1.045, 0.01, 0.916, 0.01, -2.124, 0.463, 0.398, 0.592, 0.01, 1.627, -0.766, -1.93, 0.334, 0.528, 0.592, 0.334, 0.916, 0.592, 1.369, -2.06, -0.054, 0.98, 0.334, 1.304, -0.378, -0.054, 0.98, -1.995, 0.398, -0.831, 1.627, 0.851, 0.269, 0.592, 0.075, -2.383, 0.204, 0.657, 0.528, 1.239, 0.592, -2.448, 0.851, -1.413, -0.895, 1.433, -1.154, 1.498, -0.443, 1.045, 0.657, 0.592, 0.657, 0.98, 1.11, -0.313, 0.075, -0.572, 0.592, -2.642, 0.075, 0.14, 0.269, 0.269, -1.542, -1.477, -1.348, 0.14, 0.528, 0.204, -0.119, -1.348, 0.334, 0.592, -0.96, 0.398, 0.14, 0.463, 0.722, -1.801, -0.96, 0.398, -0.184, 0.398, 0.14, -0.637, -2.706, -1.866, 1.563, 0.98, 0.204, -0.249, 0.334, 1.045, -0.313, 0.463, 0.334, 0.722, 0.528, -0.054, -0.831, -0.701, -0.443, -0.313, -1.607, 1.175, 0.14, 0.851, 1.433, -0.119, -1.542, 0.613, 0.213, 0.556, -2.13, -0.358, -2.073, -0.53, 0.556, 0.156, 0.842, -0.358, 0.842, 0.442, 0.385, 1.071, -0.072, 0.899, 1.471, 0.899, -0.53, -2.302, 0.556, -0.987, -0.873, 0.042, -2.245, 0.385, 0.613, 0.613, 0.842, -0.701, -0.53, 0.213, 0.671, -1.616, 0.213, 0.213, 0.156, -0.015, 0.956, 0.27, 0.556, -2.073, 0.499, 0.899, 0.956, 0.099, 0.671, -1.902, 0.213, 1.242, 0.556, 0.385, -2.187, 0.27, -2.302, 1.128, -0.816, -1.502, 0.556, 0.556, 0.671, 0.728, 
0.671, 0.442, 0.842, 0.042, -0.93, 0.213, 0.671, 0.099, -0.015, 0.385, 1.871, 0.385, -2.473, 0.842, 0.442, 0.213, 0.956, 0.213, 0.328, 0.442, 0.213, -2.016, 0.613, -1.387, -0.13, -1.673, 0.442, 0.385, 0.442, 0.213, 0.671, 0.613, 1.128, -0.987, 0.956, -0.015, 0.385, 0.385, 0.728, 0.27, -1.159, 0.213, -1.502, -0.987, 0.556, -0.873, 0.956, -2.473, -0.473, 1.242, -2.302, -0.244, 0.499, 0.671, 0.042, 0.442, 0.499, 0.27, 0.613, -1.101, 0.671, -1.844, -1.444, -0.587, 1.071, 0.556, -2.187, 0.156, -0.473, -0.187, 0.27, 0.556, -0.015, 0.156, -2.187, -1.273, -0.701, 0.842, 0.556, 1.757, 0.899, 0.042, 0.613, 0.785, -2.302, 0.842, 0.728, -0.015, 0.671, -0.187, 0.27, 0.956, 0.499, 0.328, 0.842, 0.728, -0.015, 0.613, 0.556, 0.042, 0.328, -2.016, 0.842, 0.27)
fifaRaw <- c(fifaRaw, -1.73, -1.959, -1.844, 0.385, 0.499, -0.987, -1.559, 0.042, 0.385, 0.842, 0.499, 0.099, 0.156, 0.499, -2.416, 0.499, 0.442, 0.556, 0.27, 0.499, -1.044, -0.072, 0.042, -0.644, 0.728, -2.645, -2.359, -1.387, 0.613, 0.842, 0.213, 0.499, -2.073, -2.645, 0.156, 0.499, 0.042, 0.556, -0.015, 0.613, 0.899, 0.956, 1.357, 0.556, 0.042, 0.556, 0.156, 1.299, 0.213, 0.499, -0.301, 0.156, -0.187, 0.671, -0.187, 0.328, 0.613, 0.842, -0.072, 0.213, 0.27, 0.442, -2.245, 0.156, 0.385, 0.328, 1.185, -2.302, 0.442, 1.014, -2.302, 0.213, 0.499, -1.73, 0.613, 0.899, 0.27, 0.613, 0.328, -0.301, -0.415, -0.873, 0.328, 0.842, 0.613, -0.587, 0.556, -0.301, 0.099, -0.415, 0.785, -2.416, 0.27, -0.015, -0.987, -2.187, -0.072, 0.328, 0.385, 0.613, 0.956, 1.357, 0.328, 0.27, -0.072, 0.728, -1.844, 0.499, -0.587, 0.27, 0.785, 1.071, -1.844, -0.072, 0.899, -2.245, 0.899, 1.014, 0.842, 0.956, 0.785, 1.071, -2.588, 0.671, 0.785, 1.128, -0.072, -2.302, -2.416, -0.244, 0.785, 0.956, 0.156, -2.13, 1.528, 0.099, -2.702, 0.785, -0.301, 0.785, 1.871, 0.27, 0.042, 1.014, -0.015, -0.015, 1.299, 0.671, 0.499, -0.415, 0.385, 0.671, -0.072, 0.27, 0.785, -1.844, 0.613, 0.042, 1.357, -0.13, 0.613, 0.499, -1.559, 0.042, -1.844, 0.328, 0.328, -2.13, 0.213, 0.156, -0.53, 0.613, -2.588, -0.015, -0.415, 0.613, 0.728, 0.099, 0.728, -1.273, 0.442, 0.899, 0.499, 0.956, -0.13, 0.556, 1.128, -2.13, -0.13, 0.328, -0.244, -1.559, -0.987, 1.185, 0.328, 0.042, 0.213, 0.671, -0.13, 1.128, 0.499, 0.442, 0.728, 0.671, -1.959, 0.213, -2.359, 0.956, -0.587, 0.156, 0.613, 0.728, -0.415, 0.556, -0.758, 0.213, 0.156, 0.099, -0.015, -1.33, 0.842, -0.473, 0.728, -0.244, 0.671, -0.13, -1.33, 0.328, 0.27, 0.442, 0.728, 0.899, -0.93, -2.588, 1.185, 0.042, 0.156, 0.442, 0.613, 0.27, 1.014, -1.959, 0.156, 0.671, -0.415, 0.956, 0.442, 0.042, 0.728, -0.301, 0.499, 0.728, 1.185, 0.556, 0.385, 0.442, 0.613, -2.416, 0.156, 0.728, 0.842, 0.899, 0.099, -2.073, 0.099, -2.073, -0.301, 0.785, 0.385, 1.014, -0.13, 0.842, 0.156, 
-0.13, 0.099, 0.042, 0.671, -1.101, 0.156, 0.156, 1.014, -2.13, 0.213, -0.187, 0.671, 0.613, -2.187, -2.245, 0.099, 0.042, 0.042, 0.671, -0.244, -0.187, 0.442, 0.842, 0.842, -0.13, -0.644, 0.613, 0.556, -2.416, -0.13, 1.071, 0.499, 0.442, -0.873, -2.53, -2.416, -2.53, 1.528, 1.014, -0.072, 0.613, 0.099, 0.842, 0.099, 0.213, 0.613, 1.014, 0.27, -0.187, -0.873, -0.187, 1.071, 0.442, -2.13, 0.842, -0.072, 0.556, 1.471, 0.213, -1.73, 1.784, 0.282, 1, -2.003, -0.044, -1.481, -1.22, 0.152, 1.066, 0.543, 0.413, -0.044, 0.021, 0.478, 0.87, -0.044, -1.872, 1.523, 0.543, 0.739, -1.415, -0.436, 0.021, -0.044, 0.347, -1.611, 1.066, 0.413, -0.044, 0.739, -0.762, 0.282, 0.543, -0.24, -1.546, 0.478, 1, 0.347, -0.371, -0.175, 0.347, 0.282, -2.591, 0.347, 1.262, 1.653)
fifaRaw <- c(fifaRaw, 0.021, 0.086, -2.199, -0.11, 0.217, 1.719, 0.282, -0.893, -1.024, -1.481, 1.392, -0.11, -1.742, 0.87, 0.478, 0.086, 1.196, 0.152, 0.87, 0.217, 0.347, -0.175, 0.674, 0.805, 1.066, 0.543, 1.066, 0.021, 0.543, -1.481, 0.674, 0.805, -0.697, 0.217, -0.762, -0.24, 1.066, 0.413, -1.938, 0.739, -0.305, 0.674, -0.371, 0.282, -0.632, -0.958, 1.719, -0.697, 1.196, -2.134, 0.805, 1.719, 0.674, 1.392, 0.282, 0.152, 0.543, 0.282, 0.609, -0.762, -0.24, 0.805, 0.282, -1.481, -2.656, -0.632, 0.152, -2.656, -0.632, 1.327, 0.152, 0.543, 0.674, 0.739, -0.175, 0.021, -1.22, 0.739, -1.938, -0.11, 0.086, 0.87, -1.285, -0.175, 1.327, 0.021, 0.805, 1.066, -1.154, 0.282, 0.347, -1.22, 0.021, -0.11, 0.478, 0.413, 1.914, 0.543, 0.935, 0.282, 0.413, -1.415, 0.543, -0.371, -1.024, -0.501, 0.674, -1.677, -1.089, 1.066, 0.805, -1.154, 0.282, 0.152, 0.347, 0.805, 0.217, 0.413, -0.044, 1.327, -1.481, -1.024, -1.415, -1.742, 0.413, 1.98, 0.413, -2.134, 0.739, 0.152, 1.523, 0.739, 0.282, 0.282, 0.805, -3.113, 0.609, 0.347, 1.131, 1.327, 0.152, -0.044, 0.086, 0.217, -1.872, 0.87, -2.199, -1.22, 0.021, 0.086, 1, 0.478, -0.24, -1.807, -0.762, 0.282, 0.282, 0.805, -0.567, 0.282, 0.805, -1.611, 1.262, 1.523, 1.066, -1.611, 0.674, -0.828, 0.805, 0.543, 1.196, -1.089, 0.347, 0.152, 0.543, 0.152, 0.478, -1.481, 1.131, -1.154, 0.282, 1.327, 1.196, -2.068, 0.413, -0.567, 1.131, 0.217, -1.481, 1.327, 0.217, -2.721, 0.086, 0.152, -1.677, 1.066, 0.282, -2.068, -0.044, 0.674, -0.567, 0.413, 0.217, 0.609, 1.327, -0.371, -0.567, 0.739, -2.068, 0.217, 0.086, 0.282, -2.003, 0.543, 0.478, -1.546, -2.525, 0.086, 0.413, 0.609, 0.021, 1.262, -0.762, 1.588, -1.35, 0.347, 0.152, -0.632, 0.805, -1.22, 0.478, 1, 0.87, -2.656, 0.674, -0.893, -0.632, 0.935, 0.674, -0.893, 0.674, -0.044, -1.481, -2.591, -0.697, -0.958, 0.674, 1.262, -2.068, -2.591, 0.217, -0.828, 0.152, 0.282, -1.154, 0.87, 1.196, -2.46, -0.24, -0.893, 1, 0.739, 0.413, 0.282, 0.805, 0.021, 0.021, 1.066, 0.347, 0.413, -0.567, 0.674, 0.739, 
0.87, 0.478, 1.784, -0.697, 0.086, -1.024, 0.935, 0.021, 0.478, -0.305, -0.501, -1.415, -1.807, 0.478, 0.282, -2.525, 1.066, 0.805, 0.935, 0.674, -2.003, -0.893, 0.282, -1.35, 1.523, -0.762, 1.066, -0.436, 0.805, 0.347, 0.935, 0.347, -0.11, 0.805, -0.305, -1.677, 0.021, 0.347, -0.24, -1.285, -0.371, 1.196, -0.24, 0.478, -0.371, 0.021, -0.11, -0.697, 0.021, 0.086, 0.021, 0.347, -1.807, -0.305, -2.134, 0.739, 0.674, -0.371, 0.739, 1.196, 1.262, 0.739, -0.632, 0.021, 0.674, -0.436, -0.11, -2.329, -1.024, -2.068, 1.653, -0.11, -0.24, 0.478, -1.938, 0.347, -0.697, 0.87, 0.674, 0.086, 0.543, -1.807, 1.719, 0.152, -1.154, 1.066, -0.371, 0.086, 1.327, -1.546, 0.609, 0.87, 0.152, 1, 1.131, -0.24, 0.935, -1.415, 1.131)
fifaRaw <- c(fifaRaw, 0.543, 0.609, 0.021, 0.935, 0.021, -0.632, -2.656, 0.478, 0.413, 0.609, 0.347, -0.305, -2.656, -0.893, -2.068, 0.478, -0.436, -0.501, 0.217, 0.543, 1.327, 0.217, 0.674, 0.217, 0.021, -0.501, 0.021, 0.282, 0.739, 0.87, -0.436, 0.152, -0.11, 0.935, -0.175, -0.958, -1.546, -0.24, 1.457, -0.11, 0.152, -0.501, 0.347, 0.674, 0.543, 0.217, 0.347, 0.086, 0.609, -0.24, -1.742, 0.021, 0.347, 0.935, 0.609, 0.347, -1.089, -1.024, -0.567, 0.805, 0.935, -0.567, 0.543, -1.154, -0.11, 1.653, 0.609, 0.217, 0.805, 0.413, 0.282, -0.632, 0.935, 1.653, 1.457, -1.938, -0.11, 0.086, 0.086, 0.674, 0.543, -1.154, 1.292, 0.086, 0.823, -2.057, -0.048, -1.789, -1.186, 0.019, 1.627, 0.421, 0.153, 0.287, -0.048, 0.555, 0.555, 0.019, -2.057, 0.823, 0.555, 0.823, -1.119, 0.22, -1.588, -0.45, 0.488, -2.325, 1.091, 0.488, 0.019, 0.622, -0.584, -0.249, 0.756, 0.086, -1.588, 0.287, 1.493, 0.354, 0.019, -0.249, 0.622, 0.153, -2.057, 0.756, 1.493, 1.56, 0.555, 0.086, -2.66, -0.316, -0.182, 1.895, 0.22, -1.454, -0.784, -1.32, 1.359, -0.784, -1.655, 0.756, 0.488, 0.287, 1.091, -0.249, 0.756, 0.287, 0.622, -0.584, 0.019, 0.756, 0.823, 0.019, 0.354, -0.115, 0.153, -0.985, 0.354, 0.756, -0.918, 0.354, -0.918, 0.019, 0.756, 0.354, -1.588, 0.354, 0.019, 0.421, -0.048, 0.086, -0.182, 0.22, 1.895, -0.383, 1.024, -2.124, 0.153, 1.627, 0.689, 1.56, 0.488, 0.287, 0.823, 0.555, 0.89, -0.784, 0.22, 0.89, 0.89, -0.249, -2.459, -0.048, -0.584, -2.861, -0.383, 1.426, 0.22, 0.689, 0.823, 0.89, -0.048, 0.153, -1.521, 0.957, -1.789, -0.115, 0.555, 0.153, -1.119, -0.249, 1.091, 0.354, 1.024, 1.158, -2.124, -0.182, 0.153, -1.655, -0.249, -1.119, 0.555, 0.488, 1.694, 0.622, 0.287, 1.024, 0.555, -1.387, 0.89, -0.249, -1.253, 0.421, 0.488, -1.588, -0.918, 0.957, 0.89, -0.985, 0.22, 0.555, 0.555, 1.292, 0.756, 0.287, -0.517, 1.158, -0.851, -0.985, -0.918, -1.387, -0.182, 1.962, 0.488, -1.454, 0.756, 0.287, 1.359, 0.823, 0.153, 0.086, -0.115, -2.794, 0.421, 0.823, 0.823, 1.091, -0.048, -0.182, 0.153, 0.354, 
-0.784, 0.555, -2.593, -1.253, -0.115, -0.45, 0.823, 0.622, 0.287, -3.129, -0.918, -0.115, 0.555, 0.756, -0.182, 0.019, 1.761, -2.325, 1.158, 1.292, 0.756, -1.655, 0.354, -1.32, 0.957, 0.823, 0.89, -0.851, 0.421, 0.22, 0.153, 0.019, 0.22, -1.789, 1.225, -1.387, 0.488, 1.493, 1.225, -2.258, 0.756, -0.45, 1.56, 0.22, -0.985, 0.957, 0.153, -2.459, 0.153, 0.22, -1.856, 0.823, 0.354, -0.651, 0.22, 0.488, -0.048, 0.421, -0.784, 0.957, 1.225, -0.249, -0.249, 0.689, -2.124, 0.823, 0.622, 0.153, -2.526, -0.115, 0.354, -1.119, -2.258, 0.421, 0.488, 0.555, 0.153, 1.426, -0.784, 1.024, -1.052, 0.421, 0.689, -1.387, 0.689, -0.718, 0.22, 0.622, 0.823, -2.526, 0.622, 0.019, -0.316, 1.158, 0.823, -0.918, 0.823, 0.22, -1.32, -2.459, -0.584, -0.985, 0.689, 1.359, -0.383, -2.727, 0.354, -0.048, 0.354, 0.086, -1.052, 0.823, 1.56, -3.263, 0.421, -0.182, 1.024, 0.555, 0.287, 0.354, 0.622, 0.153, -0.115, 0.22, 0.421, 0.488, -0.918, 1.091, -0.182, 1.158, -0.584, 1.694, -1.052, 0.22, -1.588, 0.756, 0.019, 0.689, -0.115, -0.517, -0.784, -1.789, -0.182, 0.488, -1.722, 1.024, 0.622, 1.091, 0.622, -2.258, -1.253, 0.622, -0.651, 1.426, -1.387, 0.823, -0.249, 0.555, 0.555, 0.689, 0.354, 0.019, 1.091, -0.115, -1.454, -0.249, 0.287, -0.115, -1.655, -0.584, 1.426, -0.45, 0.019, -0.048, 0.287, 0.354, -0.718, 0.153, -0.249, 0.086, 0.22, -0.784, -0.784, -2.392, 0.555, 0.421, -0.651, 0.756, 1.225, 1.426, 0.689, -1.32, 0.756, 0.086, -0.718, -0.784, -1.856, -1.99, -1.186, 1.761, 0.689, -0.316, 0.555, -2.191, 0.957, -0.048, 0.957, 0.89, -0.383, 0.354, -1.856, 0.957, 0.287, -1.454, 0.823, -0.851)
fifaRaw <- c(fifaRaw, 0.153, 0.89, -1.119, 0.488, 0.823, -0.182, 1.024, 0.488, -0.048, 1.024, -1.588, 1.426, 0.555, 0.622, -0.182, 0.89, -0.45, -0.249, -2.861, 0.488, 0.622, 1.024, -0.651, 0.488, -2.526, -0.182, -2.191, 0.354, -0.651, -0.115, 0.287, 0.421, 0.823, 0.488, 0.622, -0.115, -0.182, -0.651, -0.316, 0.354, 0.287, 1.024, -0.651, 0.421, -0.584, 0.689, -0.249, -0.182, -1.253, -0.383, 1.225, -0.115, 0.354, -0.115, 0.622, 1.024, 0.153, 0.421, 0.22, -0.316, 0.622, -0.517, -1.32, -0.048, 0.89, 1.091, 0.287, 0.22, -0.918, -1.186, -0.918, 0.756, 0.756, -0.115, 0.89, -0.584, -0.651, 1.627, 0.756, 0.019, 0.354, 0.22, -0.249, -0.584, 1.225, 1.627, 1.627, -1.99, -0.383, 0.22, -0.316, 0.89, 0.689, -0.918, 1.582, 0.508, 1.247, -2.515, -0.298, -1.709, -0.366, 1.045, 0.306, -0.097, -0.567, 0.441, 0.575, 0.172, 0.776, 0.373, 0.037, 1.65, 0.978, 0.709, -0.97, 0.105, -0.164, -0.164, 0.239, -1.373, 0.642, 0.441, 1.247, 0.911, -1.507, 0.306, 0.239, 0.373, -1.507, 0.172, 0.306, 0.306, -0.836, 0.105, 0.239, -0.164, -2.717, -0.701, 0.844, 1.717, -0.164, -0.231, -1.843, 0.844, 0.776, 1.314, 0.172, -0.97, -0.836, -2.179, 1.65, -0.231, -1.642, 0.844, 0.508, -0.097, 1.381, 0.105, 0.306, 0.037, 1.045, -1.104, 0.373, 0.441, 1.112, 0.709, 0.709, 0.508, 1.045, -3.254, 1.045, 0.642, 0.172, -0.903, -1.642, -0.366, 1.515, -0.164, -1.172, 0.642, -0.298, 0.642, -1.44, 0.037, 0.037, -0.231, 1.784, 0.239, 1.448, -1.104, -1.037, 2.053, 0.709, 1.582, 0.172, -0.164, 0.508, -0.903, 0.508, -2.112, 0.172, 1.045, 0.306, 0.441, -2.112, -0.433, 0.978, -2.045, -1.44, 0.844, 0.239, 0.844, 0.642, 0.306, 0.776, 0.441, -0.97, 1.247, -1.239, -1.037, -0.634, 1.515, -0.567, -1.507, 1.045, -0.97, 0.105, 1.314, -1.642, 0.239, 0.508, -0.164, -1.239, -0.567, 0.575, -2.246, 1.784, 0.575, 0.373, -0.03, 0.911, -0.97, 0.776, 0.575, -0.298, 0.844, 0.306, -0.433, -0.634, 1.247, 0.709, -0.366, 0.239, -0.164, -0.164, 0.373, 0.373, 0.105, -0.701, 0.978, -0.97, -1.507, -1.575, -2.112, -0.231, 1.784, -0.298, -0.366, 0.239, 
0.172, -0.164, 0.776, -0.5, 0.642, 0.508, -2.246, 0.239, 0.037, 1.784, 0.776, 0.105, -0.164, 0.105, -0.097, -0.231, 0.373, -2.582, -1.978, -0.903, 0.709, 0.978, 0.508, -0.097, -1.373, -0.769, 0.105, 0.508, 0.239, 0.911, 0.105, 0.844, -1.172, 0.844, 1.381, 0.776, -1.642, 0.642, 0.239, 0.508, 0.844, 1.247, -0.903, 0.575, -0.298, 0.844, -0.03, 0.911, -2.112, 0.306, -1.642, 1.179, 1.448, 1.851, -2.717, -0.366, -1.44, 0.776, 0.373, -1.709, 0.575, 0.373, -2.582, -0.03, 0.575, -1.373, 1.112, -0.03, 0.844, -0.231, 0.373, -0.634, 0.373, -0.164, 0.306, 1.045, 0.441, 0.373, 0.709, -0.836, 0.037, -1.911, 0.373, -2.045, -0.231, 1.717, -1.776, -1.978, 0.575, 0.037, 0.642, 0.978, 1.045, 0.239, 0.373, -0.433, -0.433, -0.097, -1.843)
fifaRaw <- c(fifaRaw, 0.441, -0.433, 0.776, 1.247, 1.045, -2.381, 1.179, -0.366, -0.366, 0.844, 0.978, 0.373, 0.037, 0.441, -0.164, -2.851, 0.508, 0.105, 0.978, 0.441, -0.03, -2.314, -0.836, -0.298, -0.231, -0.836, -1.978, 0.911, 0.844, -1.575, 0.037, -0.097, 1.314, 1.112, 0.037, 0.239, 0.978, -0.97, -1.306, 1.515, 0.575, 0.642, 0.306, 1.112, -0.298, 0.373, 0.508, 1.381, -1.172, 0.441, -1.507, 1.515, 0.844, 0.306, -0.03, -1.172, -1.978, -2.314, -0.366, 0.172, -1.911, 0.037, 0.441, 0.911, 1.247, -1.642, -1.642, 0.373, -0.701, 0.978, 0.105, 0.978, -0.836, 0.776, 0.508, 0.642, 0.239, -0.567, 1.381, 0.642, -1.575, 0.306, 0.508, -0.5, -1.104, -0.5, 1.112, -0.097, 0.575, 0.239, 0.306, 0.239, -0.5, 0.172, 0.373, -0.231, 0.239, -1.172, -0.903, -1.978, 1.381, -0.298, 0.105, 0.239, 1.045, 1.448, 1.247, -0.5, 0.441, 0.642, -0.5, -0.298, -1.911, -0.836, -2.045, 1.247, -0.097, -0.5, 0.441, -2.448, -0.567, -0.567, 1.112, 0.441, 0.105, 0.306, -1.575, 1.851, -0.433, -0.298, 0.911, 0.373, 0.373, 0.978, 0.172, 0.172, 0.709, -0.03, 1.247, 2.053, -0.634, 0.373, -0.903, 0.441, 0.105, 0.709, 0.508, 1.314, 0.844, -0.097, -1.709, 0.575, 0.911, 0.776, 0.642, 0.373, -2.515, -0.298, -1.776, -0.231, 0.844, -0.231, -0.03, -1.239, 1.112, 0.172, 0.575, 0.575, -0.366, -0.836, -0.903, -0.164, 0.978, 0.978, 0.105, -1.037, 0.105, 1.112, 0.105, -0.298, -1.709, -0.366, 0.776, -0.231, 0.037, -0.567, -0.836, 0.642, 0.776, 0.239, -1.172, -0.164, 0.441, 0.306, -1.978, -0.5, 0.172, 0.776, 0.978, -1.44, -0.231, -0.903, -1.575, 1.045, 1.381, -1.239, 0.239, -0.567, 0.642, 0.844, -0.164, 0.441, 0.575, 0.105, -0.298, -0.567, 0.306, 1.784, 1.247, -1.642, 0.105, -0.164, 0.709, 1.179, -0.03, -1.776, 0.662, -0.582, 0.21, -0.921, -1.034, -0.017, -0.356, -1.599, -0.017, 0.21, -1.938, 0.662, -0.356, 0.323, 1.34, -1.034, 1.453, 2.018, 0.662, -0.921, -0.921, 0.662, 0.323, -1.599, -0.582, -0.243, -0.13, -0.243, 1.114, 1.453, -0.695, -0.243, -0.356, -2.164, 1.905, 1.114, -0.695, 1.34, 0.323, 1.227, 1.227, 0.097, 0.323, 
-0.017, 1.34, 0.775, 0.662, -0.017, 1.114, -1.034, 1.34, 0.436, -0.921, 1.114, 0.436, 1.001, 0.549, -0.243, -0.582, -0.017, -0.243, 1.453, 0.323, 1.001, -0.243, 0.21, -0.808, -0.808, -1.712, -0.13, -0.582, -0.243, -0.017, 3.035, 0.097, 0.888, -0.243, 0.662, 0.323, 0.662, 0.436, -1.26, 0.549, -0.356, 1.114, 0.436, -1.373, -1.034, -1.486, 1.453, 0.775, -0.582, -0.017, -0.469, -0.582, 1.679, -1.486, 0.323, -0.017, -0.13, -0.13, 0.097, 0.323, -2.164, -0.356, -2.051, -0.808, -1.486, 0.549, 0.436, -2.051, 0.21, 1.679, -2.051, -0.469, 1.453, 0.097, -3.068, -0.13, -0.017, 0.549, 0.888, 0.436, 0.662, 0.097, -2.39, 1.227, 1.679, 0.662, -1.938, -0.017, -1.034, -0.582, 0.21, -0.017, -0.469, -2.503, 0.888, -0.808, 0.323, 0.436, -0.469, 1.34, 0.775, -1.938)
fifaRaw <- c(fifaRaw, 0.775, 0.662, 1.453, 1.001, -0.808, -0.582, 0.21, -0.695, 0.097, 1.453, 0.888, 0.436, 0.662, -0.017, 0.21, 0.436, 0.549, -0.582, 0.21, 0.549, 0.21, -0.808, 0.323, 0.549, 0.775, -0.808, 1.114, -0.243, -0.582, -0.469, 0.097, -0.017, 0.21, -0.582, 0.097, -0.469, -2.277, -0.582, -0.017, -0.921, -0.808, 0.549, -0.469, -0.695, -1.034, 0.436, 1.114, -1.599, -1.034, -1.26, 0.549, 0.436, -0.017, 1.566, -1.147, -0.017, -1.147, 0.323, 0.097, 0.21, -0.921, 0.323, 1.34, 0.323, 0.549, -0.243, 0.436, 0.549, 0.097, 2.018, -0.469, 0.775, -0.017, -0.695, -1.034, 0.549, -1.599, -1.373, 1.227, 1.114, 0.097, -0.13, -0.13, -0.356, -1.825, 0.21, -0.017, -0.243, 1.114, 0.662, 0.323, 0.888, -1.147, 1.114, -1.034, 1.001, -0.13, 0.549, -0.017, 0.549, -1.034, 0.549, -0.695, -0.921, -0.695, 1.227, 0.662, -0.582, 0.097, 0.323, -0.921, 0.549, 0.323, -3.633, -0.356, -2.051, -1.373, -1.599, -0.582, -0.921, 0.21, 0.323, 0.21, 2.583, 0.21, -1.825, 1.566, 1.114, 0.775, -1.486, -1.373, -0.921, 0.21, 1.792, -1.034, -0.017, 1.792, 0.549, 1.114, 1.114, 0.097, 1.679, 1.453, 2.357, -0.695, 0.549, 1.453, 1.114, -1.938, 0.097, -1.147, -1.034, 0.775, 0.549, -0.469, -0.921, 1.905, -0.469, 0.097, -0.582, -0.13, 0.775, 1.453, -0.243, 0.436, 0.21, -0.469, 0.21, 1.566, -0.243, -2.051, -0.695, 0.775, 0.888, -0.469, -0.469, -0.017, 0.775, -0.582, 0.888, 2.131, -0.695, 1.453, 0.775, -1.373, -0.356, -0.017, -0.582, -1.938, 0.549, -0.695, 0.323, -0.017, 0.775, -2.503, -0.695, 0.888, -0.243, -0.921, -0.017, 0.21, -1.26, 0.097, 0.662, 0.21, 1.114, -1.147, 0.436, 0.662, 0.21, -1.486, -0.469, 0.436, 0.323, -1.712, 1.566, 0.662, -1.486, 0.775, 0.21, -0.469, 0.888, 1.566, 0.097, 0.323, -0.243, -0.017, -1.486, 0.323, -1.034, -1.26, -0.582, 1.566, 0.549, -1.825, 0.21, -0.469, 0.549, -0.243, -0.017, -0.695, 0.21, -0.13, -0.243, -0.017, -1.825, 0.662, 0.097, -0.921, 1.227, 0.662, 0.323, 0.775, 1.34, -2.503, -1.825, 1.566, -0.017, -0.582, -0.13, 0.21, -0.582, 1.34, -0.469, -0.243, 0.549, -0.695, 0.662, 
0.775, 0.888, 0.775, -1.373, 0.549, 0.323, 2.018, -0.13, -0.469, 0.21, 0.662, -0.017, 0.323, 0.662, 0.323, 0.323, -0.695, -2.503, 0.097, 1.227, -1.712, 0.21, -0.243, 1.453, 0.097, 0.888, 1.227, 1.001, -0.356, -0.695, 1.001, -0.808, 0.097, -0.582, 1.34, 0.21, -0.921, 0.21, 0.775, 1.453, 0.549, 0.436, -1.938, -0.921, -0.243, 1.227, -1.147, -0.582, 0.323, 0.323, 1.114, -0.808, -0.582, 0.21, 0.21, -0.13, -0.921, 1.227, -0.921, -0.017, -1.26, -0.017, -0.017, -0.808, 2.583, 0.775, -1.486, 1.227, -0.582, 1.227, -1.938, -0.695, 1.453, 0.888, 0.436, -0.582, -2.051, -2.277, 1.114, -0.356, -0.356, 2.018, -1.712, -0.243, 0.888, -0.469, -1.147, 1.737, 0.969, 1.249, -0.705, 0.341, -1.403, -0.008, 0.69, -0.426, 0.969, 1.109, 0.551, 0.969, -0.008, 1.109, 0.621, -0.217)
fifaRaw <- c(fifaRaw, 1.667, 0.411, 1.179, -1.333, 0.621, -0.077, 0.132, -0.356, -1.264, 0.551, -0.356, -0.077, 1.039, -1.822, -0.775, 0.062, -1.473, -2.031, 0.83, 0.9, 0.272, -0.636, 0.411, -0.775, -0.147, -1.194, 0.132, 0.9, 1.458, -0.915, 0.132, -2.31, -0.008, 0.551, 0.341, 0.551, -1.543, -0.845, -0.705, 1.877, -0.217, -2.45, 1.179, 0.83, 0.76, 0.83, 0.411, 1.039, -0.636, -0.217, 0.76, 0.76, 0.83, 1.388, 1.109, 0.9, 0.551, 0.9, -1.403, 1.249, -0.147, -0.147, -0.217, -1.264, -0.636, 1.109, -1.822, -1.264, 0.411, -0.008, 1.388, -2.101, 0.411, -0.147, 0.272, 0.969, 0.272, 1.179, -2.45, -0.077, 2.156, 0.132, 1.039, -0.566, 0.621, 0.83, 0.062, 0.062, -2.171, 0.062, 0.83, -0.008, -0.008, -2.729, -0.356, 0.969, -1.054, -0.496, 0.83, -0.496, 0.341, 0.481, 0.341, 0.76, 0.969, -1.752, 0.9, -2.589, -0.496, -1.822, 2.086, -1.194, -1.752, 0.341, 0.76, -0.356, 1.039, -2.31, 0.83, 0.341, -0.636, -0.426, 0.202, 0.481, -2.101, 1.737, 1.179, -0.077, -0.426, 0.9, -1.124, 0.621, 1.249, -0.426, -0.496, -0.636, -0.984, -0.147, 0.76, -0.147, -0.845, -0.426, 0.341, -0.147, -0.356, 0.132, 0.272, -0.705, 0.9, -0.566, -1.473, -1.961, -2.31, -0.915, 1.458, 0.969, -0.217, 1.109, -0.217, 0.83, 1.039, 0.551, 1.249, 1.179, -2.589, 1.179, 0.411, 1.528, 0.202, 0.062, -0.147, 0.411, 0.551, -0.287, -0.077, -2.171, -2.171, 0.202, 0.9, 0.202, 0.76, 0.969, -2.938, -0.496, 0.202, 0.9, 0.341, 0.551, 0.621, 0.062, -2.031, 0.76, 0.969, 0.341, -0.287, -0.008, -0.496, 0.69, 0.341, 0.062, -1.682, -0.287, 0.551, 0.411, 0.062, 1.877, -1.892, 0.202, -1.194, 0.69, 1.318, 1.877, -1.403, -0.008, -1.124, -0.008, 0.341, -0.147, 1.318, -0.147, -2.799, -0.077, -1.543, -1.613, 1.039, 0.132, -0.426, -0.426, 0.481, -1.264, 0.341, 0.9, -0.915, 0.969, 0.132, 0.062, 0.621, -0.636, -0.845, -0.287, 0.551, -2.45, 0.9, 0.551, -0.984, -2.589, 0.411, -0.287, 1.179, 0.132, 1.039, -0.636, 0.551, -0.984, -1.403, -0.356, -0.775, 0.062, -1.892, 0.69, 0.83, 1.039, -2.31, 0.551, -0.845, -0.356, 0.411, 0.551, 1.109, 0.132, -0.287, 
-0.287, -1.752, 0.551, -0.217, 0.9, 0.132, -0.426, -0.984, -0.147, -0.775, 0.969, -0.566, -2.38, 0.132, -0.008, -1.333, -0.426, -0.147, 1.877, 1.179, 0.272, 0.272, 0.481, -1.124, 0.202, 2.086, -0.566, 1.597, 0.551, -0.636, 0.9, 0.062, -0.217, 1.318, -0.984, 0.202, -2.241, 1.528, 1.318, 0.062, -0.566, -0.496, -0.147, -0.775, 0.76, 1.458, -1.752, -0.356, 0.621, 0.341, 1.318, -1.194, -0.077, -0.287, -0.147, 0.621, -0.984, 0.9, -0.705, -0.426, 0.202, 1.179, 0.202, 0.341, 1.318, 0.411, -1.124, 0.551, 0.69, -1.333, -1.822, 0.341, 1.179, -0.008, -0.077, 0.272, 0.9, 0.132, -0.147, -0.147, 0.481, -0.287, 0.76, -1.264, -0.077, -2.171, 0.969, 1.109, 0.481, 0.272, 1.388, 1.388, 0.621, -0.217, 0.551, 1.528, -0.077, 0.132, -1.752, -0.705, -2.171, 0.411, -1.682, -0.287)
fifaRaw <- c(fifaRaw, 0.272, -1.264, -0.845, -0.775, 1.109, 0.202, -0.287, 0.341, -1.333, 1.597, 0.969, -1.403, 0.272, 0.551, -0.566, 1.737, -0.566, -0.077, 1.039, -0.636, 1.318, 2.226, -0.496, 0.062, -0.636, 0.341, 0.411, 0.481, 0.69, 1.877, 0.481, -0.984, -0.775, 0.272, 1.039, 0.132, 0.551, -0.287, -3.008, -0.356, -0.915, -0.077, 1.318, 0.132, -0.356, -1.054, 0.621, -0.217, 0.062, -1.194, -0.356, -1.054, -0.775, -0.287, 1.109, 1.109, -0.496, -0.077, -0.008, 0.83, 0.411, -0.915, -1.752, -0.077, 0.062, 0.76, 0.341, 0.551, 0.481, 0.621, 0.481, -0.636, -0.287, 0.9, 0.062, 0.83, 0.132, -1.054, -0.705, 1.039, 0.69, -0.356, -0.356, -0.217, -1.613, 0.202, 1.458, -1.054, -0.356, -0.566, 0.481, 0.132, 0.969, 0.202, 1.249, 0.272, -0.426, -0.217, 1.039, 1.109, 0.76, -1.961, -0.077, -0.705, 1.528, 1.249, 0.132, -0.636, 1.18, 0.573, 0.628, -1.854, -0.144, -1.964, -0.695, 0.187, 0.297, -0.199, -0.034, 0.518, 0.408, 1.235, 0.849, 0.077, 1.125, 1.29, 0.959, -1.909, -1.964, 0.573, -1.633, -1.909, 0.242, -1.909, 0.959, 0.573, 1.07, 0.683, -0.806, -1.192, -1.192, -0.971, -1.633, 0.573, -0.034, -1.247, -0.53, 1.014, -0.089, 0.518, -1.192, 0.077, 0.132, 0.408, 1.29, 0.353, -1.743, -0.144, 1.07, -0.364, -0.144, -1.688, 0.739, -1.743, 1.235, -0.144, -1.854, 0.683, 0.959, -0.695, 0.904, -0.144, 0.132, 0.628, -0.144, -0.585, 0.022, 1.621, 0.408, 0.904, 0.022, 1.787, 0.242, -2.019, 0.849, 0.959, -0.971, 1.235, -0.144, -0.916, 0.187, -0.53, -1.688, 1.014, -1.302, -1.743, -0.751, 0.518, -0.089, 0.573, 0.297, 0.242, 0.518, 1.125, -0.806, 0.794, -1.523, 0.849, 0.518, 0.628, 0.132, -1.137, -0.916, -0.916, -1.412, 0.353, 0.022, 1.456, -1.798, -0.585, 0.077, -1.688, -0.806, 1.014, 0.739, -0.585, 1.014, 0.849, 0.518, 1.345, -1.633, 1.4, -1.909, -1.081, -0.695, 0.297, 0.132, -2.129, 0.022, -0.199, -1.026, -0.089, 0.794, 0.628, 0.739, -1.688, -0.585, -1.302, 0.683, 0.242, 1.566, 0.904, 0.132, 0.353, 0.077, -1.633, 0.904, 0.132, 0.187, 0.408, 0.794, 0.022, 0.904, 0.132, 0.959, 1.511, 0.959, 1.125, 
0.683, 0.904, -0.144, -0.916, -1.743, 0.518, 0.628, -1.688, -1.798, -1.743, -0.089, 0.849, -1.468, -1.081, 0.242, 0.518, 0.353, 0.463, 0.187, 0.187, -0.199, -1.688, 0.628, -0.364, 0.573, 0.849, 1.29, -1.081, -0.254, 0.187, -1.523, 0.739, -2.074, -1.854, -0.695, 0.408, 0.187, -1.026, -0.034, -2.129, 0.739, -0.254, 0.849, -0.916, 0.739, -0.199, 1.125, 1.345, 1.4, 0.959, 0.849, -0.034, -0.475, 0.077, 1.511, -0.254, 0.849, -0.751, -1.247, -1.578, 0.353, -0.53, 0.297, 1.235, 1.621, -0.53, 0.022, 0.573, 0.518, -1.743, -0.475, 0.463, 0.794, 0.959, -2.019, 0.408, 0.959, -1.743, -0.144, -0.034, -1.854, 0.904, 1.18, 0.739, 1.29, 0.187, -0.144, -0.309, -1.798, 0.518, 0.849, 0.297, -0.695, 0.794, -0.916, 0.463, -0.089, 1.125, -1.743, -0.089, -0.751, -0.585)
fifaRaw <- c(fifaRaw, -1.909, -0.034, 0.959, -1.026, 0.628, -0.53, -0.089, -0.199, 0.683, 0.959, 1.235, -1.909, 0.297, -0.64, -0.475, 0.959, 0.849, -2.35, -0.53, 0.904, -1.743, 0.408, 1.125, 1.125, 0.739, 1.18, 1.456, -1.743, 0.297, 1.125, 1.125, -0.199, -2.129, -1.909, -0.254, 0.628, 1.07, 0.628, -1.688, 1.621, -0.254, -2.185, 0.187, -0.751, 0.959, 1.4, -0.861, 0.408, 0.573, 0.518, -0.64, 0.739, 0.739, -0.53, -0.64, 1.07, 0.959, -0.034, -1.357, 0.904, -0.806, 0.022, 1.07, 1.29, 0.022, 1.456, 0.959, -0.916, 0.849, -1.909, 0.573, 1.125, -1.743, 0.573, -1.247, -1.302, 0.408, -2.019, 0.573, -0.199, 0.794, 0.739, -0.64, 0.518, -1.026, -0.42, 0.408, 0.022, 0.904, -0.254, 0.683, 1.07, -1.633, -0.254, 0.187, -0.64, -1.412, -0.254, 0.794, 0.518, -0.364, -0.53, 1.125, 0.077, 1.18, -0.034, 0.959, 0.849, 0.463, -2.074, 0.077, -1.854, 0.573, -1.688, 0.683, 0.408, 1.235, -0.199, 0.463, -0.806, -0.53, -0.089, 0.628, -0.806, -1.854, 1.787, 0.959, 1.125, -1.357, -0.42, -0.089, -1.743, 0.849, 0.739, 1.621, 0.959, 0.463, -1.357, -1.964, 1.07, 0.408, 0.022, 1.125, 1.235, 0.408, 0.904, -1.798, -1.412, 1.125, -1.302, 0.739, 0.959, -0.695, -0.475, 0.022, 0.739, 0.794, 1.29, 0.904, 0.187, 0.628, 1.014, -2.129, -0.089, 0.683, 0.353, 0.683, 0.297, -1.909, 0.683, -1.854, -0.254, 1.4, 0.683, 1.345, -0.254, -0.089, 0.628, 0.408, -0.034, -0.53, 1.07, -0.806, 0.463, -0.144, 1.07, -1.633, 0.463, 0.794, 1.125, 1.07, -2.24, 0.408, 0.242, 0.187, 0.187, 1.511, -0.585, -0.364, 0.739, 0.573, 1.29, -0.695, -0.585, 0.739, 0.077, -1.854, 0.408, 1.18, -0.144, 0.683, -0.53, -0.53, -1.633, -2.074, 1.345, 0.739, -0.695, 0.573, -1.081, 0.242, 0.408, -0.254, 1.345, 0.959, -0.254, 0.353, -0.144, -0.309, 1.29, 0.463, -1.357, 1.235, -0.089, 0.573, 1.014, -0.089, -0.916, 0.881, -0.454, 0.714, -2.038, -0.454, 0.714, 0.881, -1.204, -1.037, -0.704, -0.787, 0.63, -0.787, 0.38, -1.121, -0.203, 0.13, 0.047, 0.13, -0.203, 0.213, -0.12, -0.037, -0.871, 0.714, -0.037, 0.213, -0.037, 0.047, 0.213, -1.204, -0.62, -0.871, 
0.297, -0.037, 1.298, -0.287, 1.881, -1.204, 1.381, 0.63, 0.213, 0.13, -1.538, 1.131, 0.464, 0.881, -1.121, -0.954, 0.297, -2.372, -2.288, -1.288, -0.12, 0.047, 0.547, 1.464, 0.881, -1.288, -0.203, -1.288, 0.797, 0.797, 0.213, -0.454, -0.454, -0.62, 0.213, -1.204, -0.704, 0.38, 1.298, 0.38, -2.955, 0.714, -1.621, -0.537, 0.047, 0.464, 0.547, 0.13, 0.714, 1.298, -1.371, 0.38, 0.213, 0.38, -0.871, -0.287, 1.798, 1.381, 1.047, 1.214, -0.537, 0.881, -2.872, 0.797, 2.131, 1.464, 0.797, -0.037, -1.371, 0.38, 1.131, -0.62, 0.38, -0.287, -0.62, 0.714, 0.797, -1.955, 1.298, 0.714, -0.62, 0.797, 0.047, -0.203, -0.704, 1.214, 0.714, 0.964, -0.12, -2.288, -0.954, -0.287, 0.547, 0.047, -0.12, 1.214, 0.13, -0.287, 0.881, 0.714, 1.131, 0.38, 0.464, -0.287, 0.213)
fifaRaw <- c(fifaRaw, -0.037, 1.965, 0.547, -0.787, -2.288, 0.63, 0.714, 0.797, 1.548, -0.203, 0.797, -0.12, -0.12, 1.464, -0.287, 0.63, 0.213, -0.203, -0.954, -1.454, -0.037, 0.213, -0.037, -0.037, 1.131, 1.548, 0.213, -1.538, -0.287, -0.12, -0.037, -0.954, -1.121, -1.121, -0.704, -0.203, -0.37, 0.547, -1.705, 1.047, -0.203, 1.464, 0.547, -2.538, -1.204, 0.547, 2.048, -0.037, -0.037, 1.381, -0.454, -0.37, 0.797, -1.121, -1.121, -0.704, 0.63, -0.37, -0.287, -0.787, -1.288, -2.455, 0.714, -0.871, -0.12, 0.38, 2.215, -0.871, 1.298, -0.954, 0.13, -0.787, 0.714, 1.214, -0.037, -0.454, -0.203, 0.213, 0.797, -2.705, -0.37, -0.287, 1.214, -0.037, -0.37, 0.38, 0.547, 0.213, -2.038, 0.464, -1.037, -0.537, -0.537, 0.63, 0.297, 1.131, 0.714, -1.371, -1.955, -2.955, 0.63, 0.38, -0.871, 0.797, -0.871, -0.037, 0.881, -0.287, -0.37, 0.047, -0.871, -2.205, -0.537, -0.37, -0.203, -0.037, -2.789, -0.787, 0.38, -1.204, -1.454, 0.797, -0.871, 0.047, -2.038, -0.037, -0.871, -0.203, 2.048, 1.631, -1.204, -0.537, 0.047, 2.215, 0.297, -0.37, -0.203, 0.881, -1.121, -0.704, 1.298, -2.372, 1.965, -0.62, 0.213, -0.287, 1.965, 0.464, 0.38, -0.287, 0.464, -1.121, 0.547, 0.213, 0.13, 0.547, -0.704, -0.454, -0.704, 0.13, 0.547, -2.121, -1.204, -0.62, -0.704, -0.537, -0.12, 1.381, 1.798, -1.454, 1.464, -0.704, 0.964, -0.037, 0.464, -1.037, 0.38, -0.537, 0.38, 0.297, -2.705, -0.037, -0.203, 0.13, -0.037, -0.203, 0.881, -0.287, -0.037, 2.048, 1.047, 0.797, -0.203, -0.704, -1.621, -2.288, 0.047, 0.797, -0.203, 0.797, 0.13, -0.12, 0.297, 1.214, 1.798, -0.787, -2.372, -1.121, 0.547, 1.548, 0.38, 0.547, 0.714, -0.203, 0.13, -0.871, 0.714, -1.037, 0.63, 0.464, 0.797, 0.13, 0.464, 0.63, 0.547, 1.214, -0.537, 0.047, 1.131, 1.047, 0.714, 0.964, -1.621, -0.454, -0.287, -1.371, -0.454, -0.203, 0.464, 0.881, 0.13, 0.13, 0.047, 0.714, 1.464, -1.121, 0.797, -0.12, -0.954, 1.464, 0.213, 1.131, -0.871, 0.881, 1.047, -0.787, -0.871, 0.38, -0.62, 1.047, 0.38, 0.464, -2.372, 0.547, -0.537, -0.12, 0.881, 0.464, -0.37, 
0.297, 0.63, -0.787, 0.714, 0.213, -1.037, 0.797, 1.214, 0.464, 0.297, 0.63, -0.12, 0.63, 0.714, 1.965, -0.537, 0.547, -0.954, -1.371, -0.287, 1.047, -0.287, -0.871, -3.289, 0.63, 0.213, -0.62, -1.204, 0.63, 0.547, 1.548, 0.547, 1.548, -0.203, 0.464, 0.881, -1.871, 0.213, 1.131, 0.38, -0.037, -0.454, -1.037, 0.38, -0.704, 0.714, 0.714, -0.62, 0.213, -0.12, -0.37, 0.714, -0.454, -0.454, -0.287, 0.714, 1.298, 0.797, -0.537, 1.298, -2.372, 0.13, 0.047, -0.037, -2.622, 1.381, 1.381, 0.464, 0.547, 0.047, 2.298, 0.714, 1.047, 1.298, 0.38, 1.381, -2.622, -1.204, 0.881, 0.714, -1.288, 0.547, 1.047, -0.704, 2.215, -0.454, -0.454, 0.964, -0.37, -0.537, 0.38, -0.37, 0.13, 1.44, 0.202, -0.151, -2.095, -0.799, -1.153, 0.556, -1.683, 0.792)
fifaRaw <- c(fifaRaw, -0.033, -1.094, -0.917, -0.151, 0.379, 0.556, -0.21, -0.151, 1.145, 0.144, 0.085, -1.27, 1.086, -1.919, -0.151, 0.909, -1.565, 0.438, 1.204, 0.674, 1.086, -0.505, -0.269, 0.556, 0.32, -2.036, -0.033, -0.092, -0.151, -0.092, 0.556, 0.32, 0.556, -2.213, 1.027, 1.44, 0.261, 0.909, 1.322, -1.919, 0.556, 0.202, 0.379, 0.32, -1.27, 0.909, -2.036, 0.85, -0.917, -1.27, -0.21, 0.144, 0.968, 0.615, 0.909, 0.438, -0.092, 0.32, 0.026, -1.683, -0.446, 0.674, -0.563, -0.976, 0.733, 0.144, -2.213, 0.379, 0.733, -0.328, 0.202, -0.387, 0.674, 0.674, -0.505, -1.919, 0.438, -0.328, 0.085, 0.026, 0.85, -0.151, 0.497, 1.734, -0.622, 0.674, -1.153, -0.269, 0.909, 0.968, -0.74, -0.21, 0.556, 0.733, 0.379, 0.438, -0.681, 0.733, 0.085, -0.033, -0.446, -2.861, -0.151, 1.381, -2.272, 0.085, 1.616, 0.202, -1.153, 0.556, 0.144, 0.32, 0.497, -1.977, 1.557, -2.154, 0.085, -0.505, 0.615, 0.438, -1.27, 0.615, 0.379, 0.733, -0.917, 0.379, 1.322, -0.092, -1.801, -0.269, -0.033, 1.204, 0.379, 0.556, 1.086, -0.21, 0.792, 1.499, -1.27, 0.085, 0.202, -0.563, 0.438, 0.615, -0.622, 0.438, 0.144, 1.322, 0.438, 0.674, 0.32, 0.497, 0.438, 0.792, 0.792, -1.035, 0.202, -0.269, -1.86, -1.388, -1.919, -0.033, 0.026, -0.269, -1.506, 0.909, 1.44, 0.674, 0.615, -0.21, 0.909, -0.681, -2.272, -0.622, -0.033, -0.446, -0.151, 1.322, -0.858, -0.622, 0.438, 0.909, 0.792, -2.154, -2.213, -0.269, -0.446, 0.497, 0.261, 0.85, -2.331, -1.624, -1.329, 0.968, 0.438, 1.263, -0.092, 0.497, 0.026, 0.202, 0.615, 0.968, -1.094, 0.615, 0.556, 0.438, 0.438, 1.263, -1.035, -0.092, -0.446, 0.792, 0.085, 0.733, 0.144, 1.557, 0.026, -0.328, -0.269, -0.387, -2.036, 0.792, 0.085, -0.151, 1.263, -1.624, 0.497, 0.261, -2.567, 0.261, -1.742, -1.919, 1.086, 0.674, 0.202, 1.086, 0.085, -0.681, -0.505, -0.21, 0.379, 0.615, 0.615, -0.21, 1.381, 0.144, 0.32, 0.261, -0.446, -2.39, 0.379, -0.563, -1.742, -2.036, -1.801, 0.32, 0.792, 1.381, 1.204, 1.793, 0.32, -1.035, 0.261, 1.145, -2.508, 0.438, 0.144, -0.269, 0.144, 0.497, 
-2.979, 0.85, 1.675, -1.094, 0.792, 1.086, -1.683, 1.086, 0.909, -0.681, -2.39, 0.026, 0.202, 1.499, 0.32, -2.036, -2.449, -1.683, 1.44, 1.145, -0.976, -2.449, 0.792, 0.379, -2.743, 0.085, -0.033, 0.909, 0.615, 1.145, 0.438, -0.092, 0.202, 0.085, 0.379, 0.615, -0.622, -1.27, 0.497, 1.086, 0.379, 0.026, 0.085, -1.506, -1.094, -0.151, 1.027, -0.622, 0.497, 0.085, -0.387, -0.269, -1.919, -0.092, -0.269, -1.624, 1.145, -0.446, 0.32, 0.085, -2.449, -0.681, 0.85, 0.026, 0.674, 1.675, -0.21, 0.438, 0.32, 1.263, 0.792, 0.909, 0.497, 0.497, 0.615, -1.094, -0.505, 1.027, 0.144, -1.565, -0.505, 0.909, 0.32, -0.21, 0.261, -0.033, 0.556, 0.144, -0.033, 1.499, 0.556, 0.379, -2.095, -0.033, -2.39, 0.32, -0.387, 1.145, 1.145, 0.85, -0.799, 0.379, -0.033, 0.968, -0.269, 0.674, -0.858, -2.036, -0.681, -1.742, 0.792, -0.446, 0.32, -0.21, -2.743, 0.261, 0.144)
fifaRaw <- c(fifaRaw, 1.145, 0.792, 0.144, 0.733, -1.212, 0.909, 0.085, 0.144, 0.792, 0.556, 0.438, 1.086, -1.919, 0.085, 0.556, -0.151, 1.263, 0.026, 0.202, 0.556, -0.446, 1.616, 0.202, 0.85, 0.497, -0.151, 1.322, 0.438, -2.154, 1.322, 0.144, 1.263, -0.033, -0.033, -2.92, 0.792, -1.565, -0.74, 0.674, -0.505, 0.674, 0.32, 1.263, 0.438, 1.381, 1.557, 0.379, 1.44, -0.622, 0.556, 0.497, 0.968, -1.447, 0.615, 0.497, 1.204, 0.379, -1.447, -1.742, 0.026, 0.556, -0.033, 1.322, -0.446, -0.21, 0.085, 1.086, 0.026, -0.799, -0.622, 1.145, 0.144, -1.565, -0.033, 0.909, -0.269, 1.734, 0.379, -1.683, -1.329, -1.624, 1.263, 0.968, -0.21, 0.379, 0.32, 1.027, -0.033, -0.446, 0.261, 0.556, 0.438, 0.085, 0.085, 0.497, 0.968, 0.615, -2.154, 1.204, -0.21, 0.674, 0.85, 0.085, -0.328, -0.138, -0.787, -0.381, -2.164, -1.597, -0.868, 0.753, -2.732, 0.753, -1.111, -1.678, 0.348, -0.057, 0.915, -1.192, 0.429, 0.834, 0.51, -0.3, -0.138, -0.543, 0.753, 0.105, -1.759, -0.057, 0.186, 0.105, 0.591, 0.915, -0.219, 1.158, -0.868, 0.186, 0.591, 1.32, 0.267, -0.543, 0.429, 1.32, 0.105, 0.915, 0.186, -0.868, -0.057, -0.462, -2.326, 1.077, -0.138, -1.759, -0.787, 0.105, -0.706, -0.949, -0.057, 1.077, 0.348, -2.326, -0.138, -1.84, -0.219, 0.186, 0.672, 0.105, 0.105, 0.51, -0.625, -0.625, -2.245, -1.516, 0.267, 0.105, 0.996, -2.488, 0.591, -0.3, -2.002, -0.787, 1.239, 0.51, 2.212, 2.131, 0.915, 0.105, 1.077, -0.138, 0.186, -0.706, -1.678, 1.483, 1.158, 1.077, 1.239, -1.759, -2.651, -0.462, 0.348, 0.672, -2.975, 0.51, -0.138, -0.138, -0.3, 0.51, 0.348, -0.219, 1.564, -0.219, -0.138, 0.834, 0.186, -1.111, 1.32, 0.024, -0.381, 0.51, 0.267, -0.625, -2.164, 0.591, 1.564, 0.024, -0.462, -1.921, 0.024, 0.51, 0.105, 1.726, -1.921, 1.483, 1.239, 0.186, -0.3, 0.51, -1.597, 1.726, 1.402, -0.462, 0.186, -1.435, 0.51, -0.057, 1.077, -1.03, 0.51, -1.759, 0.834, 0.105, -0.3, 0.024, -1.111, -0.381, 0.672, 1.564, 1.158, 0.915, -0.706, 0.753, 1.645, 0.996, 0.834, 0.672, 0.753, 0.834, -0.219, -1.597, -1.273, -0.625, 
-0.138, 0.267, 0.915, 0.429, -1.354, -1.03, 0.024, -1.759, 0.51, 0.348, 0.591, -1.435, 0.51, -1.678, -0.462, -1.678, 0.348, 0.186, 1.158, 0.672, 0.996, -0.868, -0.625, 1.483, 0.672, -0.381, -0.706, -0.706, 0.186, -0.787, -0.138, -1.759, -1.354, 0.996, -2.651, -0.3, -0.625, 0.672, -0.868, 0.024, 0.591, 1.158, 0.024, -0.462, 1.077, 0.105, 1.077, 0.267, -0.381, 0.186, 1.969, -0.787, 0.51, 0.348, -0.219, -2.245, 0.915, 1.32, 1.564, -0.3, -2.732, -2.894, -1.678, 0.186, 0.834, 0.834, 0.915, -0.3, -1.192, 0.429, -0.219, 0.753, 0.348, -0.057, -0.949, 0.591, 0.024, 0.915, -0.462, 0.672, -0.787, -1.111, 0.429, 0.024, -0.3, 0.348, 0.591, 0.672, 0.105, 1.564, -0.625, -0.057, 0.024, -1.03, 0.186, -0.219, -0.625, 0.51, -0.543, 1.077, 0.105, 0.267, -0.219, 0.834, 1.564, 1.239, -0.949, 0.672, 0.915, -2.326, 0.267, -0.138, 0.591, -0.057, 0.915, -0.057, 0.105, 0.996, -0.462, 0.996, 0.429)
fifaRaw <- c(fifaRaw, 0.915, -1.516, 0.591, 0.672, 0.186, 0.834, -0.3, -1.111, -0.381, 1.158, 0.672, -0.057, 0.834, 0.51, -0.057, -1.192, -0.057, 0.753, -2.245, 0.51, 0.834, 0.267, -0.949, 1.483, -0.219, -0.949, 0.51, -2.245, -1.678, 0.996, 0.024, 0.186, -0.625, 0.348, 0.591, -0.949, 1.888, -1.678, -2.732, 0.996, 0.672, 0.024, 1.807, 0.105, 0.753, -1.111, 0.915, 1.077, -0.381, 0.591, 0.105, -1.759, 1.807, 1.077, 1.645, -0.462, 1.239, -0.625, 0.267, 0.672, -0.219, -0.706, 1.402, -0.138, -1.273, 0.105, 0.267, -2.083, -0.138, 0.996, 0.024, -0.462, 0.348, 0.429, -0.787, 0.591, -1.111, 0.915, 0.915, 0.753, 0.591, 0.186, -0.381, 0.267, -0.462, -0.625, -0.381, -2.083, 0.429, 0.834, -0.381, -1.597, -1.516, 0.591, 0.915, -2.651, 1.402, -0.787, -0.381, 0.996, 2.212, 0.996, -0.787, 0.753, -0.625, -0.381, 0.996, 1.402, -0.381, 0.267, 0.105, 0.105, -2.002, -1.435, -1.435, 1.239, 0.753, 0.834, -0.057, -1.111, -0.138, -0.625, -0.219, 1.564, -0.462, -0.625, 0.915, 0.753, -0.057, 0.834, -0.706, 0.753, -0.381, -0.625, -0.543, 1.077, -0.706, -0.057, -1.111, 1.158, -0.543, 0.51, -1.84, 0.672, -0.057, -0.462, 0.429, 0.51, 0.348, 1.402, -0.219, 0.996, 0.591, 1.077, 0.348, 1.645, 0.429, 0.753, -1.597, 0.591, 0.672, 1.239, 1.158, -0.543, 0.429, -0.381, 0.591, 0.51, 0.024, -0.381, 0.996, -1.597, -1.516, -0.787, 0.429, 0.672, 0.753, -0.787, 1.239, -1.435, -0.057, 0.105, 1.077, -1.597, 0.591, 0.996, -0.3, -0.3, -2.002, 1.402, -0.057, 0.591, 0.672, 0.753, 0.348, -0.706, -0.787, 1.077, -0.138, -0.462, 1.726, -0.138, -1.516, 0.51, -0.219, -0.706, 0.915, -1.354, -0.381, -0.625, -0.3, 1.32, 1.229, 0.426, 0.627, -1.481, -0.377, -1.481, -0.226, 0.326, 0.075, 0.577, -0.477, 0.778, 0.778, 0.527, 1.279, 0.175, 1.229, 1.229, 0.978, -1.13, -1.581, 0.276, -1.33, -1.18, -0.828, -1.531, 0.928, -0.377, 0.727, 0.978, -1.079, -0.979, -0.728, -0.527, -1.481, 0.828, 0.276, -0.728, -1.431, 0.978, -0.477, 0.627, -1.28, 0.276, 0.125, 0.627, 1.129, 0.226, -1.732, 0.226, 1.581, 0.527, 0.125, -1.782, 0.677, -1.481, 
1.129, -0.778, -1.933, 0.326, 1.129, -0.577, 1.079, -0.628, 0.426, 0.878, -0.126, -1.33, 0.376, 0.828, 0.577, 0.376, -0.527, 2.283, 0.075, -1.531, 1.029, 1.631, -0.176, 1.079, -0.678, -1.23, 0.125, -1.28, -1.782, 1.029, -1.481, -0.879, -1.029, 0.276, -0.728, 0.727, -0.778, 0.978, 0.928, 0.778, -0.979, 1.029, -0.728, 0.577, 0.426, 1.179, 0.476, -1.28, -0.828, -1.079, -1.079, 0.527, -0.678, 1.48, -1.782, -1.13, -0.226, -1.882, -0.477, 1.179, 0.677, -0.276, 0.828, 0.627, 0.426, 1.43, -1.631, 1.179, -1.732, -1.28, -0.929, 0.878, 0.276, -1.682, 0.276, -0.276, -0.377, -0.076, 0.226, 0.175, 0.727, -1.38, -0.427, -1.431, 0.828, -0.076, 1.882, 1.129, -0.076, -0.226, 0.627, -1.732, 0.778, -0.176, 0.577, 0.577, 0.577, -0.176, 0.878, 0.627, 0.476)
fifaRaw <- c(fifaRaw, 1.279, 1.029, -0.477, 0.627, 0.426, 0.577, -1.18, -1.732, 0.426, 0.727, -1.481, -1.732, -1.631, 0.226, 0.828, -0.678, -1.732, 0.125, 0.627, 0.527, 0.928, 0.075, -0.025, -0.025, -2.033, 0.778, -0.577, 0.727, 0.276, 1.179, -0.728, -0.076, 0.376, -0.527, 0.778, -1.983, -1.631, -1.18, 0.878, 0.476, -0.628, 0.778, -2.133, -1.832, -0.527, 0.878, -1.18, 0.476, -0.226, 1.029, 0.928, 1.38, 1.179, 0.878, -0.377, -0.377, -0.126, 1.631, -0.126, 0.878, -1.481, -0.929, -0.728, 0.677, -0.828, 0.677, 0.677, 1.53, -1.38, 0.276, 0.778, -0.327, -1.933, -1.079, 0.075, 0.476, 0.978, -1.732, 0.627, 1.38, -1.983, -0.276, 0.426, -1.531, 0.376, 1.279, 0.828, 1.43, 0.376, -0.427, -0.025, -0.778, 0.727, 0.878, 0.627, -0.879, 0.577, -1.28, 0.677, -0.678, 1.029, -1.882, 0.125, -0.126, -1.13, -1.983, -0.527, 1.179, -0.778, 1.129, 0.577, -0.327, 0.426, 0.025, 0.075, 1.079, -1.832, 0.376, -1.732, -0.076, 0.527, 1.129, -1.732, 0.025, 1.079, -1.33, 0.727, 0.928, 1.33, -0.577, 1.179, 0.978, -2.033, 0.878, 1.38, 1.129, 0.426, -1.631, -1.933, -0.025, 0.376, 1.33, -0.327, -1.631, 2.082, -0.577, -2.033, -0.226, -1.28, 1.279, 1.781, -1.13, 0.577, 0.778, -0.628, -1.18, 1.681, 0.978, -0.025, -0.076, 0.527, 1.38, 0.276, -0.879, 0.426, -1.732, 0.175, 1.179, 1.229, -0.327, 1.38, 1.079, -1.13, 0.878, -1.581, 0.276, 0.577, -1.732, -0.327, -1.079, -0.828, 0.577, -2.033, -0.377, 0.075, 0.928, 0.527, -0.276, 0.928, -1.33, -0.628, 0.276, 0.125, 0.878, -0.176, 0.527, 1.029, -1.33, -0.025, -0.176, -0.226, -1.481, 0.025, 0.778, -0.176, -0.628, -0.628, 1.279, -0.377, 1.279, 0.025, 1.33, 0.928, 0.978, -1.882, 0.125, -1.732, 0.778, -0.929, 1.129, 0.075, 0.928, -0.276, 0.376, -1.029, 0.376, 0.476, 0.727, -0.527, -1.782, 1.581, -1.23, 1.38, -0.879, -1.18, -0.327, -1.33, -0.427, 0.577, 1.129, 1.079, 1.079, -1.079, -1.431, 1.129, -0.577, 0.075, 1.029, 1.029, 0.778, 0.778, -1.631, -0.628, 1.279, 0.376, 1.179, 0.878, -1.28, -0.628, 0.928, 0.727, 0.828, 1.279, 1.129, 0.928, 1.079, 1.179, -1.481, -0.327, 
0.978, 0.727, 1.229, 0.376, -1.882, 0.778, -1.732, 0.276, 1.129, 0.476, 1.43, -0.879, 0.226, 0.025, -0.025, 0.376, -0.276, 1.079, -1.23, 0.677, 0.025, 1.029, -1.682, -0.226, 0.627, 0.326, 0.978, -1.732, -0.076, 0.426, -0.076, 0.276, 1.33, -1.029, 0.175, 0.677, 0.928, 0.928, -0.728, -0.628, 0.527, 0.276, -1.732, 0.426, 0.878, -0.728, 0.978, -0.979, -1.33, -1.732, -1.732, 0.627, 0.778, -1.23, 0.778, -1.029, 0.727, -0.327, -0.527, 1.079, 1.53, 0.577, -0.226, -0.527, -0.678, 0.878, 0.677, -1.933, 1.179, -0.427, 0.727, 1.38, 0.125, -1.13, -0.79, 0.142, -1.373, -2.189, 0.434, -1.315, 0.725, -0.441, 1.016, -0.266, -0.965, -0.557, 0.375, 0.608, 0.026, 0.084, 0.725, 1.424, 0.084, -0.557, -1.723, 1.366, -1.373, -0.266, 0.375, -1.606, 1.075, 1.249)
fifaRaw <- c(fifaRaw, 1.424, 0.492, 0.55, 0.2, 0.375, 0.725, -1.198, 0.375, -0.324, 1.133, 0.725, 1.016, 1.249, 0.492, -1.548, -1.315, 0.725, -1.198, 0.317, 0.084, -1.256, 0.55, 0.841, 0.492, -0.79, -1.14, 1.482, -1.897, 1.249, 0.55, -2.247, 0.026, -0.207, 1.133, -0.441, 0.841, 0.084, -0.207, 0.608, -0.324, -0.732, -0.441, -0.324, 1.075, -1.548, 0.259, 1.133, -1.431, 0.259, -1.14, 0.725, 0.958, 0.608, -0.79, 1.016, 0.084, -0.907, 0.958, -0.207, -0.441, -0.557, -0.033, 0.608, 0.783, 0.958, 0.434, 0.55, 0.841, -0.266, -1.14, 1.016, 0.026, 0.084, 0.2, -0.557, -0.149, 0.958, -0.207, 0.492, -0.615, 0.142, -0.149, -2.13, 0.434, 1.366, -2.189, -0.149, 1.657, 0.2, -1.198, 0.317, 0.026, 0.667, 0.375, -1.839, 1.133, -0.674, -0.382, 0.958, 0.9, 0.725, -1.315, -0.091, 0.608, 0.2, -1.14, -0.615, 1.949, -0.79, -2.072, -0.557, 0.142, 0.841, 0.783, 0.667, 0.317, -0.732, 1.133, 1.016, -1.082, 0.375, -0.557, -1.373, -0.033, 0.55, 0.9, 1.191, -0.091, -0.441, 1.482, 0.259, 0.841, 0.667, -0.266, 1.133, 0.259, -1.082, -0.79, -0.033, -1.606, -0.674, -1.14, 0.55, -0.382, -0.091, -1.023, -0.382, 1.191, -0.441, 0.317, 0.434, 0.492, -0.499, -1.839, -0.965, 0.841, 0.026, 1.133, 0.259, 1.308, -0.091, 0.2, 1.133, 1.016, -1.606, -2.305, -0.557, 0.2, -0.615, 0.667, -1.198, -2.538, -1.315, -0.615, 1.133, 0.958, 0.725, 0.084, -0.674, 0.55, 1.424, 0.55, -1.431, 1.016, 0.667, 0.142, 0.667, -1.489, -0.091, 1.075, -0.441, 0.084, 0.783, 0.2, -1.023, 0.375, 2.24, 0.841, -0.732, -1.315, -0.382, -2.189, 0.841, 1.016, 0.375, 1.191, -2.247, -0.382, 1.075, -1.606, 1.191, -0.79, -1.548, 0.026, -0.091, 0.55, 0.841, -0.557, 0.783, -1.373, -0.441, -1.373, -1.489, 1.249, -0.033, 1.716, 0.783, -1.664, 1.075, 0.317, -1.839, 0.434, -2.072, 0.084, -1.606, -0.324, 0.9, 0.317, -0.033, 0.841, 1.308, 0.841, 0.434, 1.949, 1.249, -1.839, -0.033, 0.2, -1.256, 1.133, 0.55, -1.082, 0.026, 1.308, -1.198, -1.606, -0.965, 0.55, 1.075, 1.133, 1.599, -2.189, 0.958, 1.89, 0.841, -0.149, -1.198, -2.305, -1.315, 1.424, 1.308, -1.373, 
-2.305, 0.667, -0.615, -1.664, 0.2, 0.9, 0.375, 0.084, 0.9, 0.841, -0.499, 0.9, -0.033, 0.608, 1.133, -1.256, -1.14, -0.091, 0.492, -0.207, 0.55, -1.373, -1.606, -0.674, 1.366, 0.492, 0.259, 1.657, -0.674, -0.324, -0.674, -2.13, -0.79, -1.373, -1.664, 0.958, 0.2, -0.324, 0.55, -1.723, 0.375, 1.599, -0.266, -0.091, 1.191, -0.79, -0.324, 0.667, 0.608, 0.725, 1.075, -0.149, 0.608, -0.674, -1.956, -0.965, -0.324, 0.492, -1.198, -1.315, 0.2, 0.841, 0.084, 0.667, 0.55, 0.317, -0.557, 1.774, 1.133, 0.667, -1.664, -0.907, -0.091, -2.014, 0.259, -0.732, 0.259, 0.725, -0.207, 0.317, -0.79, 0.608, 0.841, -0.907, 0.55, -0.091, -1.373, 1.89, 1.016, -1.14, -0.499)
fifaRaw <- c(fifaRaw, 0.783, 0.375, -0.907, 0.783, 1.308, 1.016, 1.075, 0.841, -0.091, -2.305, 0.259, 0.259, 1.075, 1.249, -0.557, 0.725, 1.075, -1.781, 0.142, 0.841, 0.084, 0.841, -0.324, 1.191, 0.841, -0.499, 0.958, -1.198, 1.133, 0.2, -1.082, 0.608, 0.608, -1.023, 0.375, -0.848, 0.725, -0.965, 0.317, -2.422, 0.084, -1.956, -1.606, 0.725, -1.664, 1.191, 0.725, 1.075, 1.191, 1.075, 0.55, 0.841, 1.075, -0.499, 1.308, -1.14, 1.249, -2.014, 0.667, 0.142, 0.667, 1.016, -1.548, -1.256, 0.259, 1.075, 0.492, 1.308, 0.142, -1.723, -0.674, 0.958, -0.207, 0.2, 0.434, -0.033, 0.142, -2.364, -0.848, 0.9, -0.324, 0.841, 1.716, -0.848, -1.14, -1.664, 1.89, -0.033, 0.725, -0.149, 0.55, 0.2, -0.907, 0.026, 0.142, 0.317, -0.033, 0.375, -0.674, -0.149, 0.492, -1.082, -1.198, 0.608, -0.324, 1.482, -0.965, 0.434, 1.075, -1.686, 0.133, -0.537, -1.207, 0.277, -1.351, 0.516, -1.159, 0.085, -0.25, -1.255, 0.324, -0.106, 0.564, -0.968, -0.585, 0.994, 0.324, 0.851, 0.324, -1.351, 0.899, -1.063, 0.037, 0.659, -1.542, -0.681, 0.803, 1.042, 1.138, 0.468, 0.277, 0.516, 0.277, -1.255, -1.063, -0.872, 1.138, 0.851, 1.186, 1.234, 1.329, -1.542, 0.899, 0.899, -0.728, -1.016, 0.803, -1.59, -0.25, 1.377, -0.537, -1.255, -1.542, 0.611, -1.111, 1.09, 0.468, -1.829, -1.255, 0.564, 1.425, -1.494, 0.994, 0.324, 0.851, -0.202, 0.803, -0.872, 0.277, -0.681, -0.728, -1.063, 1.664, 0.946, -1.063, 0.037, 0.516, 1.138, -0.92, 0.946, 0.516, 0.994, 0.277, -1.207, -0.681, 0.181, 0.372, 0.229, 1.234, 1.473, 0.42, 0.994, 0.564, 0.085, -0.968, 0.564, -1.255, 0.899, -1.063, -1.255, 0.564, 0.707, -0.011, 0.994, -0.058, 0.659, -0.25, 1.09, 0.899, -1.159, 0.994, 1.616, -1.638, 0.707, 0.994, 0.372, -1.063, -0.154, -1.159, 0.611, 0.755, -1.733, 0.946, -1.111, 0.085, 1.234, 0.803, 1.09, -1.446, 0.803, 0.468, 0.42, -0.776, -0.489, 0.707, -1.063, -1.686, 0.468, 0.803, 1.042, 0.946, 1.042, 1.138, -1.159, 0.899, 0.659, -1.207, 0.324, -0.633, -1.542, -0.489, 0.851, 1.138, 1.281, 0.181, -1.446, 1.186, 0.803, 1.186, 0.803, 
-1.016, 0.468, 0.946, -1.59, -0.728, -0.968, -1.207, -1.542, -1.111, 0.611, 0.133, 0.611, -1.159, 0.803, 0.372, -1.063, 0.707, -0.154, 0.803, -0.92, -1.829, -0.776, 0.994, 0.229, -1.398, 1.042, 0.42, -0.441, 0.372, 0.946, 0.564, -1.925, -1.494, 0.181, 0.899, -0.633, 0.659, 1.138, -1.829, -1.638, -0.824, 0.277, 0.851, -0.776, 0.516, -1.542, 0.899, -0.585, 1.09, -1.446, 1.281, 1.09, 0.803, -0.441, 0.085, 0.851, 0.707, 0.324, 0.372, 0.899, 0.085, 0.229, 0.899, 0.946, 0.899, -1.063, -1.063, -1.398, -1.207, 0.707, 0.994, -1.398, 1.186, -1.111, -0.776, -0.537, -1.781, 1.186, -0.824, -1.733, 0.468, -0.633, 0.516, 1.042, -0.872, 0.803, -1.59, 0.372, -1.063, -1.686, 1.09, 0.564, 0.803, 0.946, -1.063, 1.138, 0.133, -0.92, 0.372, -0.537, 0.516, -1.686, -0.346, 0.851, 0.611, 0.755, 0.946, 1.616, 0.659, -1.207, 1.808, -1.111, -1.303, 0.181, 0.803)
fifaRaw <- c(fifaRaw, -1.351, -1.398, -0.776, -1.063, 0.468, 1.377, -1.159, -0.633, -1.351, 0.181, 1.473, 0.899, -0.824, -1.638, 1.042, 1.616, 1.281, -1.111, -1.351, -1.925, -1.016, 1.138, 1.377, -1.255, -1.446, 0.516, 0.277, -1.59, 0.564, 0.755, -0.25, -0.489, 0.803, 0.946, -1.303, 0.611, 0.851, 0.899, 1.09, -1.255, -1.446, 1.042, 0.803, -0.824, 0.707, -1.398, -1.686, -1.494, 1.281, 0.372, -0.25, 1.616, -0.537, 0.324, -0.25, -1.351, -1.111, -1.351, -1.59, 0.564, 0.707, 0.707, 0.755, -1.59, 0.755, 1.281, 0.277, -0.489, 0.803, -1.303, -0.106, 0.899, 0.899, 0.516, 1.234, 0.324, 0.659, 0.229, -1.303, 0.037, -0.393, 1.09, -1.638, -1.59, 0.994, 1.138, 0.468, 0.946, 0.851, 0.372, -0.92, 1.76, 0.946, -1.733, 0.277, -1.59, -1.207, -1.59, -0.106, 0.229, 0.229, 1.09, -0.537, -1.159, -1.303, 0.707, 0.851, -1.063, 0.659, 0.468, -1.686, 0.516, 1.521, -0.585, -0.106, 0.946, 0.707, -0.92, 1.281, 1.09, 1.09, 1.138, 1.329, -0.011, -1.303, -0.728, 0.037, 0.707, 0.707, 0.707, 0.229, 1.09, -1.207, 0.707, 1.09, 0.372, 1.042, -1.063, 0.803, 0.994, -1.063, 0.707, -1.59, 1.569, 0.899, 0.564, 0.324, -0.872, -1.494, 0.755, -0.441, 0.611, -0.968, -0.393, -1.877, 0.659, -1.829, -1.542, 0.946, -1.303, 1.234, 0.899, 1.281, 1.234, 1.234, 0.564, 0.564, 1.281, 0.516, 0.611, -1.303, 0.851, -1.686, 0.659, 0.516, 0.564, -0.298, -0.968, -1.255, -1.686, 0.037, 0.707, -0.537, 0.133, -1.255, -0.011, 0.899, -1.398, 0.755, -0.154, -0.441, 0.181, -1.111, -1.686, 1.281, -1.542, 0.851, 0.468, -1.59, -1.207, -1.207, 2.047, -0.011, 0.611, -0.298, 0.659, 1.281, -1.303, -0.154, -0.872, -1.207, 0.946, 0.803, 0.516, -0.154, -0.489, -1.303, -1.207, 1.856, 0.229, 1.138, -0.25, 0.611, 0.707, 1.227, 0.436, 0.881, -1.738, -0.848, -1.688, -1.293, -0.453, 0.535, 0.584, -0.404, 0.881, 0.683, 0.239, 1.177, 0.14, 1.029, 1.573, 0.98, 0.14, -1.441, 0.337, -1.787, -0.008, 0.09, -1.886, 0.634, 0.733, 0.288, 0.683, -1.342, 0.337, 0.041, -1.243, -1.54, 1.326, 0.337, -0.008, -0.7, 0.584, -1.046, 0.436, -2.034, 0.535, 0.782, 1.128, 
1.078, 0.584, -1.836, -0.107, 0.98, 0.535, 0.288, -1.639, 0.98, -1.688, 0.93, -0.404, -2.182, 1.276, 0.387, 0.337, 0.98, 0.634, 0.337, 0.387, 0.14, -1.293, -0.305, 0.683, 0.486, 0.634, 0.14, 1.474, 0.782, -1.738, 0.387, 0.535, 0.189, 1.326, -0.552, -1.194, 0.239, 0.584, -1.787, 0.634, -0.996, -0.404, -0.947, -0.255, -0.947, 0.782, 0.288, 0.337, 0.831, 1.523, -0.996, 1.276, -0.107, 0.387, 0.881, 0.831, 0.683, -1.243, 0.387, -1.243, 0.14, 0.337, -0.552, 0.733, -2.281, -1.243, 0.288, -2.182, -1.342, 0.831, 0.634, -0.206, 0.337, 0.782, -0.601, 0.93, -1.688, 0.733, -1.688, -1.293, -0.255, 0.98, 0.535, -1.738, 0.535, -1.046, -0.206, 0.189, 1.029, 0.239)
fifaRaw <- c(fifaRaw, 0.535, -2.133, -0.996, -0.058, 0.634, 0.041, 1.177, 0.683, 0.436, 0.436, 1.177, -1.836, 1.029, 0.337, 0.535, 0.93, -0.255, -1.145, 0.535, 1.029, 0.881, 1.128, 0.535, -1.54, 0.436, 0.93, -0.305, 0.14, -1.738, 0.831, 0.337, -1.738, -1.886, -1.738, 0.486, 1.078, 0.041, -1.639, -0.157, 0.683, 0.782, 0.683, 0.486, 0.189, -0.157, -2.034, 0.239, -0.354, 0.733, 0.831, 0.288, -0.7, 0.189, -0.157, -0.601, 1.177, -2.034, -1.688, -0.898, 0.634, 1.128, 0.041, 1.029, -2.133, -1.935, -0.008, 0.683, 0.584, 0.98, -0.404, 0.782, 0.98, 1.326, 0.634, 0.239, -0.947, 0.683, -0.157, 1.474, 0.387, 0.733, -1.293, -0.255, -0.157, 0.782, -0.799, 0.288, -0.7, 0.387, -1.095, 0.98, 0.486, 0.189, -2.232, 0.387, 0.535, 0.634, 0.436, -1.935, 0.634, 1.029, -2.232, -0.651, 0.387, -2.034, 0.041, 1.128, 0.535, 0.387, -0.206, -1.046, -0.206, -0.354, 0.436, 0.93, -0.157, -1.046, 0.14, -1.293, 0.337, -0.947, 0.881, -2.182, 0.634, -0.058, -1.392, -2.083, 0.14, 0.09, 0.387, 0.93, 0.881, 1.029, 0.436, 0.535, -1.243, 1.523, -1.935, 0.239, -0.601, 0.09, 0.535, 1.375, -1.886, -0.799, 1.177, -1.985, 1.326, 1.523, 1.029, 0.733, 0.782, 1.622, -2.133, 0.486, 0.98, 1.227, 0.288, -1.589, -2.133, 0.09, 0.782, 0.239, 0.535, -1.441, 1.82, 0.337, -2.232, 0.337, -1.194, 1.029, 1.474, -1.046, -0.749, 0.93, 0.239, -0.996, 1.523, 0.634, 0.387, 0.189, 0.683, 0.584, 0.239, -0.7, 0.683, -1.886, 0.782, -0.848, 1.573, 0.09, -0.058, 1.078, -1.342, 0.634, -1.836, 0.881, 0.288, -1.886, 0.337, 0.881, -1.441, 0.189, -2.182, -0.552, -0.848, 0.782, 0.881, 0.041, 0.486, -1.342, 0.14, 0.634, 0.584, 1.128, -0.058, 0.337, 1.029, -1.589, -0.354, 0.337, -0.601, -1.639, 0.189, 1.078, 0.486, -0.7, -0.305, 0.634, 0.041, 1.227, -0.404, 0.486, 0.831, 0.831, -1.441, 0.288, -1.886, 0.535, -0.058, 0.486, 1.177, 1.029, 0.189, 0.733, -1.342, -0.255, 0.189, 0.14, -1.145, -1.738, 0.93, -1.441, 1.078, -0.305, 0.041, -0.255, -1.935, -0.206, -0.157, 0.337, 0.436, 0.782, -0.404, -1.688, 1.424, 0.288, -0.058, 0.881, 0.733, 0.337, 1.128, 
-1.935, 0.436, 0.733, 0.733, 0.881, 1.177, -0.947, -0.404, 0.239, 0.683, 1.029, 0.93, 0.337, 0.584, 0.535, 1.029, -1.738, 0.634, 1.128, 0.782, 1.128, 0.09, -2.232, 0.387, -2.083, -0.255, 0.881, 0.98, 1.029, -1.293, 0.535, -0.404, 0.189, 0.288, -0.206, 0.881, -0.996, -0.157, 0.387, 1.177, -1.935, 0.239, 0.93, 0.436, 1.177, -1.589, -1.836, 0.337, -0.453, 0.337, 1.128, -1.194, -0.008, 0.782, 0.337, 1.227, -1.342, -0.749, 0.93, 0.387, -1.738, 0.436, 1.227, 0.584, 0.634, -0.947, -1.886, -1.787, -1.589, 0.535, 0.782, -0.898, 1.276, 0.09, 0.189, 0.733, -0.354, 1.276, 1.128, 0.782, -0.848, -1.54, -0.354, 1.177, 0.584, -1.738, 0.337, -0.354, 0.288, 1.029, 0.041, -1.095, 0.517, 0.517, 0.731, -1.907, 0.374, -0.838, -1.123, 0.16, 0.089, 1.016, 0.517)
fifaRaw <- c(fifaRaw, 0.232, 0.374, 0.588, 1.586, -1.693, 1.658, 1.301, 1.016, -0.41, -1.693, 0.588, 0.374, -1.693, -0.125, -0.41, 0.588, 0.731, 0.873, 1.515, -1.337, -0.766, -1.265, -1.479, 0.089, 0.659, -0.053, -0.909, -1.194, 0.873, -1.693, 0.802, -0.553, 1.016, 0.374, 0.731, -0.053, 1.158, -3.119, 0.945, 1.8, 0.446, 0.873, -1.622, 0.588, -0.624, 0.659, -1.337, -0.838, 0.873, 0.731, -0.125, 0.374, -0.053, 0.089, 0.945, 0.16, -1.551, 0.802, 0.232, 0.089, -0.339, 0.517, 2.299, 0.517, -1.052, 0.374, 1.016, 0.588, 0.16, -0.695, -1.622, -1.052, -1.194, 0.446, 0.659, -1.836, -0.481, -1.693, 0.374, 0.089, 0.089, -0.766, 1.23, 0.945, 1.301, -1.693, 1.301, -0.41, -0.909, 0.303, 1.087, 0.16, -1.265, -1.052, -0.41, -0.41, 0.018, -0.98, 1.444, -1.622, -1.693, 1.515, -1.479, -1.337, 0.446, 0.659, -0.909, 0.588, -0.196, 0.303, 1.016, -1.479, 1.158, -1.408, -1.693, -0.553, 1.586, 0.517, -1.907, 0.089, -1.622, -1.337, 0.089, 0.945, 0.16, 0.16, -2.905, -1.123, -1.265, 0.802, 0.588, 1.8, 0.945, -0.481, 0.018, 1.515, -0.98, 0.945, 0.588, -0.053, 1.444, -0.125, -0.909, 1.016, 0.517, 0.374, 1.158, 0.945, -2.263, 0.659, 0.303, -0.053, -0.053, 0.802, 0.802, 0.303, -0.267, -2.62, -0.267, 0.802, 0.945, -1.123, -1.836, 0.018, 0.446, 0.303, 1.087, -0.196, 0.588, 0.232, -1.551, 0.446, -0.695, 0.303, 0.232, 0.802, -1.622, 0.303, -0.481, -1.408, 1.158, -1.693, -0.766, -1.836, 1.087, -0.053, -1.052, 1.372, -0.125, 0.588, 0.16, 0.659, 0.303, 1.016, -0.053, 0.517, 1.301, 0.873, 1.515, 0.089, -0.339, -0.339, -0.481, 1.943, 0.873, 0.446, -1.194, -0.553, -0.624, 0.873, -1.052, 0.731, -1.123, 0.16, -1.408, 1.515, 0.517, 0.446, -1.337, -0.553, 0.232, 0.303, 1.444, 0.873, 0.802, 1.301, -1.907, -0.838, 0.232, -1.052, 0.588, 1.372, 0.802, 1.016, 0.446, -0.98, -0.267, -1.123, 0.588, 0.303, 1.087, -0.267, 0.374, -1.764, 0.16, -0.909, 1.158, -1.408, 0.945, -0.196, -1.551, -0.624, -0.41, 0.446, -0.339, 1.158, 1.016, 1.372, 0.374, -0.909, -1.907, 0.802, 0.089, 0.588, -2.121, -0.267, 0.802, 1.729, 0.16, 
-0.624, 1.158, -1.622, 1.016, 0.731, 0.945, 0.089, 1.087, 1.23, -0.838, 1.016, 1.301, 1.301, -0.053, 0.232, -1.764, -0.481, 1.087, 0.873, -0.624, -2.05, 1.943, -0.41, -1.978, 0.018, -1.337, 1.586, 2.37, -0.481, -0.053, -1.194, -1.551, -1.693, 1.586, 0.659, 0.089, -0.481, 0.089, 0.802, -0.053, -1.622, 1.016, -0.909, 0.018, -0.624, 1.586, -0.766, -0.553, 0.588, -1.194, 0.16, -1.265, 1.016, -0.196, -0.41, 0.303, 0.303, -1.265, 0.873, -1.479, 0.303, -1.337, 0.802, 1.515, -0.053, 1.372, -1.836, 0.303, 0.588, 0.731, 1.158, 0.089, 0.731, 1.444, -0.909, -0.196, 1.016, -0.624, -1.978, -0.41, 1.23, 0.018, -0.125, -1.408, 1.087, -0.695, 1.301, 0.16, 1.016, -0.553, 0.945, -2.05, -0.053, -0.125, 0.873, -0.553, 0.731, 0.731, 0.659, -0.481, 0.659, -1.265)
fifaRaw <- c(fifaRaw, 0.089, 0.446, -0.838, -0.339, -1.337, 1.586, -2.05, 1.087, -1.194, 0.303, -0.624, -1.408, -1.123, -0.339, -0.909, -0.339, 1.301, -0.909, -1.337, 0.731, 0.018, -0.267, 0.446, 1.016, 0.802, 1.087, -1.194, -1.123, 0.659, 0.945, 0.802, 0.018, -1.479, 0.089, -0.766, 0.446, 0.303, 1.444, 0.802, -0.196, 0.873, 0.517, -0.339, 0.374, 1.016, 0.374, 1.729, 0.588, -1.551, 0.945, -0.766, -0.267, 0.873, -0.553, 1.372, -1.337, 0.873, -0.41, -1.337, -0.267, 0.16, 1.016, -1.337, -0.196, -0.053, 1.158, -1.408, -0.196, -0.553, 0.303, 0.802, -0.766, -0.267, -0.553, -0.41, -0.125, 0.374, -0.624, -0.41, 0.731, 0.232, 0.374, -1.052, -0.267, 0.374, 0.802, 0.731, -0.053, 0.945, -0.695, 0.446, -1.194, -0.339, -1.123, -1.693, 0.659, 1.301, -1.265, 0.659, 0.16, 1.23, 0.446, 0.089, 1.016, 1.586, 0.446, -0.196, -1.907, -0.98, 0.16, 0.303, -1.265, 0.303, -0.053, 0.945, 1.871, -0.766, -1.836, 1.026, 0.712, 0.712, -1.606, 0.086, -1.042, -0.854, 0.211, -0.165, 0.023, -0.039, 0.838, -0.29, 0.587, 0.336, 0.462, 2.154, 0.838, 0.838, -1.042, -1.543, -0.603, -1.167, -0.729, -0.729, -1.543, 1.088, -1.105, 1.339, 0.712, -0.353, -0.165, -0.165, -0.353, -1.731, 0.336, -0.541, -1.23, -0.917, 0.587, -0.478, 0.274, -0.791, -0.165, -0.039, 0.399, 0.712, -0.165, -0.979, 0.399, 1.527, 0.086, -0.165, -1.857, 0.65, -1.105, 0.023, -0.729, -0.729, 1.339, -0.039, -0.29, 1.715, 0.9, 0.211, 0.023, -0.666, -0.541, 0.274, 1.214, 1.026, 0.65, 0.524, 1.59, 0.462, -2.233, 0.274, 0.336, -0.102, 1.088, -0.165, -0.541, -0.227, -0.478, -1.481, 1.402, -0.854, -0.666, -0.854, 1.402, -0.039, -0.227, -0.039, 0.462, 1.214, 1.652, -0.478, 1.59, -0.165, 0.65, 0.9, -0.29, -0.039, -1.418, -1.293, -0.791, -0.541, 0.524, -0.729, 1.778, -2.358, -0.666, 0.462, -2.358, -0.791, -0.165, 0.712, 0.838, 1.026, 0.838, 0.712, 1.026, -1.355, 0.775, -2.045, -0.979, 0.023, 0.963, -0.165, -1.606, -0.729, 0.211, -0.039, 0.336, 0.963, -0.165, 0.399, -2.233, -0.666, -0.603, -0.227, -0.039, 0.336, 0.838, 0.65, -0.478, 0.9, -1.481, 
0.838, 0.838, 0.775, 1.339, -0.227, -1.167, 0.023, 0.775, 0.399, 1.652, 0.211, -1.105, -0.165, 0.399, -0.415, -0.603, -1.606, 0.838, 1.402, -1.418, -1.543, -2.107, 0.65, 0.65, -0.478, -1.543, -0.791, 0.462, 0.086, -0.353, 0.086, 0.274, -0.227, -2.107, -0.039, -0.165, -0.353, 1.276, 1.276, -0.227, -0.478, 0.524, -0.979, 1.088, -2.233, -1.919, -0.478, 0.65, 1.402, -0.29, -0.478, -2.17, -1.418, 0.086, 0.587, -0.353, 1.339, 0.149, 0.963, 1.339, 1.088, 1.214, 1.214, -0.415, -0.165, 0.211, 1.966, 0.023, -1.042, -0.791, -0.541, -0.854, 0.838, -0.791, 0.524, 1.715, 1.84, -0.729, 0.838, 1.151, 0.149, -2.045, -0.415, -0.603, 1.151, 0.775, -1.794, 0.399, 1.464, -2.17, 0.086, 0.462, -1.355, -0.227, 1.402, -0.29, -0.039, 0.775, 0.274, 0.9, -0.729, 0.399, 0.211, -0.102, -1.105, 0.399, -0.039, 0.274, -0.478, 1.527, -2.233, -0.353, -0.102)
fifaRaw <- c(fifaRaw, -0.102, -1.982, 0.149, -0.102, -0.791, 0.65, 0.462, -0.039, 0.149, 0.462, -0.603, 1.088, -1.982, -0.102, -0.979, 0.524, 0.524, 1.276, -1.355, -0.791, 1.151, -1.481, 1.088, 1.778, 1.402, -0.791, 1.464, 2.216, -1.857, 0.587, 0.9, 1.84, 0.712, -2.233, -1.857, 1.214, 0.023, 0.023, 0.775, -1.669, 1.464, 0.149, -1.794, 0.023, -0.102, 1.966, 1.652, -0.729, -0.415, 0.399, -0.353, 0.086, 1.339, 0.023, -0.102, -0.102, -1.105, 1.026, 0.274, -0.666, 0.838, -1.105, 0.023, -0.227, 1.464, -0.039, -0.541, 1.402, -0.478, 0.9, -1.481, 0.211, 1.214, -1.857, -0.227, -0.478, -0.854, 0.587, -2.233, 0.65, 0.023, 0.775, 0.524, -0.917, 1.276, -0.854, -0.165, -1.293, -0.541, 0.712, -0.603, -0.165, 1.151, -1.418, -0.603, -0.165, -0.165, -1.606, 0.712, 1.464, 1.339, -0.227, -0.478, 1.464, -0.165, 0.462, 0.211, 1.214, 1.088, 1.339, -1.669, 0.9, -2.358, 0.023, -0.854, 0.211, 0.211, 1.276, -0.666, 1.214, -0.854, -0.415, -0.102, 0.524, -0.227, -1.543, 2.028, -0.917, 0.838, -0.917, -0.478, -0.478, -0.165, 0.023, 0.336, -0.603, -0.29, 0.211, -1.042, -1.543, 0.462, 0.211, -0.666, 0.023, 1.214, 0.65, 1.715, -1.731, -0.791, -0.415, 0.838, 0.399, 0.587, -0.478, 0.149, 0.9, 0.587, 1.026, 2.216, 0.336, 0.65, 1.088, 0.775, -1.982, -0.165, 1.088, 0.023, 1.026, 0.211, -0.415, -0.791, -1.418, 1.026, 1.652, 1.214, 0.211, -0.666, 0.712, -0.227, 0.838, -0.039, -0.29, -0.227, -0.165, -0.415, -0.541, 0.963, -1.418, -0.478, 0.462, -0.165, 0.963, -2.233, -1.481, 0.775, -1.23, 0.211, 1.402, -0.666, 0.65, 1.088, -0.165, 1.214, -0.165, -0.29, 1.276, -0.353, -1.669, 1.088, 0.336, -0.039, 0.524, -0.729, -1.042, -1.982, -2.045, 1.715, 1.652, -0.541, 0.963, -1.919, 1.402, -0.039, 0.149, 2.028, 1.088, -0.478, -1.105, -0.603, -0.478, 1.026, 0.462, -2.421, 1.276, -0.541, 0.086, 1.402, -0.478, -0.979, 0.801, 0.31, -0.673, -1.573, -0.918, 0.064, -0.263, 0.392, 0.31, 0.392, 0.146, 0.555, -0.263, 0.31, 1.292, -0.673, 1.702, 2.029, 0.555, -0.591, -1.655, 0.637, 0.064, -1.901, 0.064, -0.345, 0.637, 0.473, 
1.128, 1.702, -0.837, -1.328, -0.345, -0.918, -0.837, 1.128, -0.018, 0.637, 0.228, 1.128, 0.883, 0.064, 0.555, 0.146, 0.637, 0.637, 0.473, 0.883, -0.182, -0.345, 0.392, -0.837, -0.018, -1.41, 0.473, -1.819, 0.473, -0.427, -1.655, 0.31, 0.473, 0.555, 0.31, 0.473, 0.064, -0.1, -0.018, -1.082, 0.146, 0.146, -0.182, 0.392, -0.018, 2.193, 0.146, -0.509, 0.473, -0.918, 0.965, 1.047, -0.263, 0.473, -0.182, -0.182, 0.064, 0.228, -1.328, -1.655, -1.41, 1.128, 1.21, 0.637, 0.555, 0.31, 0.146, 1.783, -1.164, 1.538, -0.345, 0.31, 0.146, -0.509, 0.228, -0.837, -0.427, -0.591, -0.1, 0.146, -0.345, 1.292, -2.638, 0.637, 1.538, -2.965, -0.918, 1.128, 0.555, -1.655, -0.1, 0.228, 0.719, 1.047, -1, 0.473, -1.082, -1.328, 0.883, 0.719, 1.538, -1.082, 0.392)
fifaRaw <- c(fifaRaw, -0.918, -0.427, -0.755, 1.865, 0.31, -0.263, -3.539, -1.819, 0.228, 0.801, 0.473, 1.947, 0.801, -1.328, -0.345, 0.883, -0.182, 0.883, -0.1, -1, 1.456, -0.182, -0.427, 1.62, 0.064, 0.801, 1.374, 0.555, 0.473, 0.228, 0.637, -0.263, 0.637, 0.31, 1.047, -0.755, -0.345, -3.702, -0.182, 0.555, 1.21, -1.246, -0.263, -0.837, 0.31, 0.31, -0.018, -0.591, 0.392, -0.182, -2.228, -0.755, 0.555, 0.228, -0.345, 0.555, -0.182, -1.737, -1.328, -0.018, 1.128, -1.737, -2.147, -1.246, 0.228, 0.392, -0.509, -0.1, -0.182, -0.591, -0.509, 0.31, 0.228, 1.374, -1.164, 0.146, 1.538, 0.965, 1.374, -0.673, 1.128, 0.31, 0.555, 1.374, 0.392, 0.555, -0.018, -1.41, -0.755, 0.637, -1.164, 0.064, -1.082, 1.538, -1.082, 0.31, 0.555, -0.018, -2.147, 0.31, 0.883, 0.31, 1.292, -0.1, 0.31, 1.292, -2.31, 0.555, -0.182, -0.263, -0.1, 1.047, -0.018, 0.228, -0.263, 1.047, -0.918, -1.655, -0.1, 0.473, 0.883, -0.755, 0.392, -0.1, -1.41, 0.883, 0.965, -2.147, 0.146, -1.328, -0.673, -2.147, -1.41, -0.263, 0.146, 0.392, 0.637, 1.62, -0.182, -0.427, 1.21, 1.292, -1.901, 0.31, -0.427, -1, 0.965, 1.538, -0.1, -0.182, 1.783, -0.673, 1.128, 0.637, 0.801, 1.702, 0.146, 1.702, -2.802, 0.31, 2.111, 1.783, -0.755, 0.31, -1.573, -1.246, 0.965, 1.128, -0.591, -1.655, 1.292, -0.755, -2.802, -0.427, -0.755, 0.637, 1.947, 0.228, -0.345, 1.21, 0.228, -0.018, 1.62, 0.719, -1.082, -1.246, 0.228, 0.392, -0.673, -0.509, 0.31, -1, 0.637, 1.292, 0.883, -1.41, 0.883, 0.719, -1, 0.228, -1.983, 0.473, -0.263, -2.638, -0.427, -0.673, -0.263, 0.637, -2.556, 0.473, -0.1, 0.883, 0.473, -0.182, 1.128, -0.673, 0.228, 0.31, -0.591, 1.62, -1.328, 0.392, 0.064, -0.837, -1.655, -0.1, 0.064, -0.182, -1.082, 0.883, 0.637, -0.755, 0.801, 0.31, -1.082, 1.047, 1.374, 1.047, 0.719, 0.064, -0.755, -0.755, -0.918, 0.555, -1.41, -0.755, 0.392, 0.801, -0.755, -0.509, -1.246, 1.128, -0.427, -1.082, -1.492, -2.802, 0.31, 1.21, 1.21, -1.41, 0.555, -0.1, -1.082, 0.146, 0.719, -0.263, 0.473, 0.473, -1.246, -2.392, 1.292, -0.1, -0.673, 
0.801, 0.801, 0.392, 0.719, -2.883, 0.555, 0.31, -0.182, 0.883, 0.228, 0.555, 0.719, -0.837, 0.473, -0.755, 1.783, 0.637, -0.182, 0.473, 0.965, -1, -0.018, 0.719, 0.801, 1.374, -0.263, -1.655, 0.064, -0.018, -1.246, 1.047, 0.392, 1.456, 0.146, 0.555, 1.21, 0.555, -0.509, -0.345, 1.456, -1.082, 0.228, -0.427, 1.047, -0.673, -0.1, 0.555, 0.555, 1.047, 0.31, 0.555, -0.918, 0.392, -0.673, 0.883, -1.164, -0.591, 0.146, 0.392, -0.427, -0.591, -1.082, -0.1, 0.146, 0.801, -0.263, 0.801, -1.082, 0.392, -0.1, -0.345, -0.918, -2.72, 1.947, 0.473, -0.018, 0.883, 0.146, -0.018, -0.673, -0.018, 1.62, 1.456, -1.082, 0.228, -0.345, -0.755, 0.555, 0.555, -1.655, 1.128, -0.837, 0.965, 1.783, -0.182, -1.164, 0.232, -0.472, -0.522, -1.428, 0.232, -1.377, 0.836)
fifaRaw <- c(fifaRaw, -0.774, 0.283, 0.836, -0.22, -0.371, 0.484, 0.685, -0.874, -0.824, 0.937, 0.081, 0.937, 0.383, -1.88, 0.031, -1.981, -0.422, 0.534, -1.73, -0.623, 0.635, 0.635, 1.138, 0.383, 0.383, -0.12, 0.685, -1.88, -0.522, -1.428, 1.289, 0.987, 1.238, 1.289, 0.635, -1.528, 0.635, 1.037, -0.925, -1.025, 0.635, -1.78, -0.874, 1.138, -0.422, -0.371, -1.78, 0.534, -1.629, 1.037, 0.434, -1.78, -0.874, 0.283, 1.49, -0.723, 1.138, 0.434, -0.623, -0.573, 0.786, -0.371, 0.484, -1.227, -0.723, 0.031, 1.238, 1.037, -1.88, -0.17, 0.735, 1.037, 0.333, 1.238, 0.735, 0.735, -0.673, -1.478, -0.774, 0.031, 0.434, 0.484, 1.238, 1.289, 0.283, 0.635, 0.132, -0.573, -0.522, 0.434, 0.735, 1.188, -1.478, -1.176, 0.635, 0.584, -0.17, 0.584, 0.333, 0.081, 0.182, 1.087, 0.735, -1.629, 1.188, 1.037, -1.981, 0.735, 0.735, -0.12, -0.673, -1.579, -0.673, 0.685, 0.283, -0.975, 1.037, -1.277, 0.182, 1.138, 0.937, 1.44, -1.83, 0.232, 0.584, 0.735, -0.472, -0.623, 0.182, -1.377, -1.981, -0.371, 1.037, 1.037, 0.735, -0.019, 0.735, -0.774, 0.836, 0.534, -1.73, -0.422, -0.422, -0.925, -1.377, 0.987, 1.138, 1.44, 0.534, -1.176, 1.339, 0.735, 1.238, 0.685, -0.925, 0.132, 1.138, -1.528, 0.383, 0.031, -1.377, -1.78, -1.126, 0.836, -0.975, 0.333, -1.931, 0.685, 0.685, -0.673, 0.283, -0.774, 0.735, -1.327, -1.377, -0.623, 1.087, 0.232, -1.327, 0.685, 0.735, -0.774, 0.534, 1.138, -0.371, -2.082, -1.629, 0.383, 0.383, -0.12, 0.786, -0.22, -1.327, -1.327, -0.371, 0.534, 0.635, -0.925, 0.484, -1.73, 0.383, -0.271, 1.238, -1.377, 1.087, 1.087, 0.685, -1.126, -0.371, 0.685, 0.987, 0.584, 0.333, 0.987, 0.232, -0.12, 0.886, 1.339, 0.886, 0.182, -0.925, -0.975, -2.132, 0.836, 1.087, -0.573, 1.641, -1.73, 0.081, 0.434, -1.679, 1.087, -0.874, -1.83, 0.031, -0.774, 0.987, 1.037, 0.383, 0.786, -1.126, 0.182, -0.522, -1.377, 1.138, 0.836, 0.937, 1.138, -0.321, 1.289, 0.232, -1.931, 0.786, -0.17, 0.031, -1.78, -0.975, 0.937, 0.635, 0.484, 0.534, 1.641, 0.635, -1.126, 2.043, -0.522, -1.629, 0.232, 0.584, -1.428, 
-0.673, 0.283, -1.277, 0.685, 1.842, -1.528, -1.227, -1.277, 0.534, 1.289, 0.886, 0.584, -1.981, 0.735, 1.389, 0.987, -0.12, -1.227, -1.83, -0.472, 1.238, 1.037, -0.623, -1.428, 0.383, 0.534, -2.082, 0.383, 0.836, 0.534, -0.774, 0.685, 0.886, -0.925, 0.937, 0.836, 0.434, 1.037, -0.573, -0.925, 0.836, 0.635, -1.428, 0.584, -1.277, -1.78, -0.12, 1.339, -0.774, -0.371, 1.389, -1.126, 0.584, -0.975, -1.679, 0.685, -0.925, -1.377, 0.333, 0.584, 0.383, 0.031, -1.78, 0.937, 1.238, -0.12, -0.673, 1.087, -1.428, -0.07, 0.886, 1.289, -0.07, 1.238, 0.283, 0.836, 0.031, -1.88, -0.22, -0.07, 0.987, -1.88, -1.025, 0.534, 1.238, 0.333, 0.886, 0.836, 1.087, -0.019, 1.641, 0.937, -1.629, -0.07, -1.78, 0.383, -1.88, -0.12, 0.132, 0.333, 1.238, -0.824, -0.371)
fifaRaw <- c(fifaRaw, -0.422, 0.735, 0.937, -0.522, 0.886, 0.383, -1.629, 0.685, 1.44, -1.327, 0.383, 1.087, 0.735, -1.83, 1.49, 0.786, 0.937, 1.389, 0.886, 0.333, -1.428, 0.534, -0.371, 0.534, 0.635, 0.031, 0.484, 1.238, -1.377, 0.735, 1.087, -0.12, 0.786, -0.874, 0.987, 1.238, -0.975, 0.836, -0.12, 1.641, 0.635, 0.635, 0.383, -0.623, -1.579, 0.635, 0.132, 1.037, -0.774, -0.07, -1.931, 0.383, -1.83, -1.176, 0.886, -1.327, 1.087, 0.635, 1.339, 1.339, 0.886, 0.836, 0.534, 1.238, 0.484, 0.685, -1.327, 0.584, -1.629, 0.987, -0.422, 0.786, -0.321, -1.277, -1.83, -1.78, 0.383, 0.232, -0.472, -0.522, -1.277, -0.321, 1.188, -0.422, 0.635, 0.635, -0.371, 0.081, -1.478, -1.478, 1.289, -1.025, 0.836, 0.081, -1.83, -1.78, -1.428, 1.993, -0.874, 0.484, -0.019, 0.735, 0.635, -1.428, -0.17, -1.227, -1.126, 0.534, 0.735, 0.534, 0.081, -0.573, -0.22, -1.679, 1.138, 0.534, 1.188, 0.132, 0.434, 0.937, -1.551, -0.023, -1.506, -1.281, 0.382, -1.641, 1.011, -0.697, 0.292, 0.337, -0.292, -0.068, -0.337, 0.786, -0.517, -1.641, 0.921, 0.292, 0.427, 0.337, -1.371, 0.696, -1.461, 0.247, 0.651, -1.551, -0.292, 0.966, 0.831, 1.011, 0.606, 0.696, 0.786, 0.831, -1.551, -0.922, -1.012, 1.281, 0.966, 1.146, 1.236, 0.741, -1.416, 0.292, 0.921, -1.057, -0.832, 0.786, -1.326, -0.248, 1.325, -0.742, -0.292, -1.461, 0.606, -1.596, 1.056, 0.786, -1.686, -1.191, 0.516, 1.325, -1.551, 1.056, 0.292, 0.786, 0.067, 1.281, 0.067, 0.831, -0.967, -0.292, -0.787, 1.415, 0.921, -1.641, -0.337, 0.337, 0.921, -0.967, 0.921, 0.831, 0.831, 0.247, -1.371, -0.697, 0.472, 0.561, 0.516, 1.191, 1.236, 0.651, 0.561, 0.921, 0.067, -0.967, 0.831, -1.236, 0.876, -0.877, -1.461, 0.651, 0.786, 0.472, 0.831, 0.741, 0.561, -0.113, 0.966, 0.696, -1.731, 1.146, 1.325, -1.551, 0.876, 1.056, 0.247, -0.742, -1.101, -1.506, 0.741, 0.786, -1.506, 0.741, -1.641, 0.472, 1.146, 0.651, 1.191, -1.461, 0.696, 0.247, 0.741, -1.326, -0.877, 0.786, -0.922, -1.371, 0.382, 0.921, 1.101, 1.101, 0.561, 0.966, -0.877, 1.236, 0.921, -1.461, -0.472, 
-0.337, -1.506, -0.517, 1.011, 1.011, 1.236, -1.236, -1.101, 1.146, 0.696, 1.281, 0.696, -1.146, 0.831, 0.876, -1.596, -0.742, -1.012, -1.416, -1.596, -1.506, 0.696, -0.517, 0.876, -1.686, 0.741, 0.292, -1.236, 0.292, -0.607, 0.696, -0.877, -1.641, -0.697, 1.101, -0.472, -1.191, 0.876, 0.696, -0.517, 0.292, 1.011, 0.202, -1.596, -1.461, 0.741, 0.921, -1.101, 0.741, 1.011, -1.461, -1.461, -0.517, 0.516, 0.831, -0.697, 0.427, -1.596, 1.011, -0.832, 0.786, -1.146, 1.101, 1.146, 0.606, -1.057, 0.112, 0.876, 0.876, 0.472, 0.516, 0.966, 0.382, 0.292, 0.786, 1.325, 0.921, 0.022, -1.326, -1.281, -1.551, 0.561, 1.056, -1.461, 1.37, -1.236, -0.248, -0.158, -1.506, 1.146, -0.158, -1.506, 0.382, 0.112, 0.651, 1.011, -0.517, 0.921, -1.641, 0.696)
fifaRaw <- c(fifaRaw, -0.742, -1.506, 1.101, 1.011, 0.921, 0.966, -0.203, 1.101, 0.247, -1.596, 0.427, -1.146, 0.696, -1.551, 0.022, 0.696, 0.786, 0.561, 0.651, 1.46, 0.696, -1.461, 1.73, -1.146, -1.506, 0.516, 0.786, -1.461, -1.326, -0.382, -1.641, 0.561, 0.876, -1.641, -0.967, -0.472, 0.157, 1.46, 0.966, -0.832, -1.641, 1.056, 1.191, 1.191, -1.012, -1.551, -1.506, -0.922, 1.281, 1.325, -0.967, -1.461, 0.651, 0.472, -1.641, 0.651, 0.921, -0.697, -0.517, 1.011, 0.831, -1.326, 0.921, 0.696, 0.651, 1.011, -0.877, -1.686, 0.966, 0.786, -1.057, 0.606, -1.191, -1.641, 0.516, 1.281, -0.158, -0.292, 1.46, -1.146, 0.741, -1.416, -1.416, -0.967, -0.832, -1.551, 0.831, 0.606, 0.786, 0.561, -1.641, 0.831, 1.056, 0.292, -0.877, 1.191, -1.191, 0.472, 0.786, 1.101, 0.561, 1.056, 0.382, 0.561, -0.472, -1.506, -0.158, 0.337, 0.786, -1.641, -1.371, 0.831, 1.146, 0.561, 1.146, 0.472, 0.382, -1.281, 1.64, 0.741, -1.236, -0.292, -1.416, -0.967, -1.596, -0.068, 0.247, 0.247, 1.011, -0.697, -0.742, -1.146, 0.651, 0.786, -0.248, 0.606, 0.472, -1.551, 0.516, 1.37, -0.787, 0.202, 1.281, 0.741, -1.596, 1.37, 1.146, 0.606, 1.415, 1.236, 0.022, -1.416, -0.697, 0.022, 0.651, 0.831, 0.337, 0.247, 1.325, -1.506, 0.651, 0.876, 0.651, 0.651, -1.551, 1.056, 1.191, -1.012, 0.651, -1.326, 1.37, 0.831, 0.651, -0.607, -0.652, -1.461, 0.472, -0.113, 0.741, -0.248, -0.158, -1.506, 0.651, -1.686, -1.236, 0.876, -1.101, 1.191, 0.741, 1.146, 1.011, 0.786, 0.831, 0.651, 1.46, 0.606, 0.606, -1.281, 0.921, -1.506, 0.831, -1.506, 0.966, -0.742, -1.146, -1.461, -1.326, 0.292, 0.651, -0.562, -0.292, -1.461, -0.652, 0.651, -1.101, 0.876, -0.248, -0.113, 0.112, -1.596, -1.012, 1.236, -0.922, 0.831, 0.516, -1.012, -1.461, -1.506, 2, -0.248, 0.606, -0.697, 0.651, 0.561, -1.236, -0.203, -1.146, -1.057, 0.921, 1.056, 0.921, 0.337, -0.562, -1.371, -1.686, 1.37, -0.248, 1.011, -0.113, 0.696, 0.516, -1.516, 0.053, -1.331, -1.377, 0.837, -1.562, 0.976, 0.099, 0.376, 0.56, 0.145, -0.685, -0.731, 0.699, -0.593, -1.608, 
0.929, -0.316, 0.237, 0.237, -1.47, 0.791, -1.377, 0.053, 0.837, -1.423, 0.791, 1.022, 0.653, 0.791, 0.468, 0.745, 0.883, 0.699, -1.285, -0.962, -1.1, 1.206, 0.883, 0.976, 1.252, 0.422, -1.562, 0.883, 1.114, -1.1, -1.1, 0.653, -1.562, -0.778, 1.483, -0.778, 0.007, -1.562, 0.33, -1.377, 1.252, 0.745, -1.654, -1.562, 0.653, 1.621, -1.331, 1.16, -0.039, 0.33, -0.178, 0.976, -0.501, 0.745, -0.962, -0.501, -1.054, 1.068, 0.791, -1.562, -0.039, 0.468, 1.022, -1.562, 0.883, 0.837, 1.022, 0.468, -1.47, -0.731, 0.237, 0.33, 0.468, 1.206, 1.252, 0.653, 0.837, 0.929, 0.191, -1.193, 0.883, -1.147, 0.929, -0.916, -1.285, 0.653, 0.791, 0.468, 0.976, 0.468, 0.56, -0.178, 0.929, 0.284, -1.516, 1.022, 1.298, -1.47, 0.699, 1.252, 0.007, -1.147, -1.285)
fifaRaw <- c(fifaRaw, -1.423, 0.745, 0.653, -1.516, 0.791, -1.562, 0.284, 1.206, 0.422, 1.16, -1.239, 0.606, 0.284, 0.929, -1.516, -1.008, 0.699, -0.778, -1.377, 0.376, 0.929, 1.114, 0.791, 0.376, 1.022, -1.008, 1.252, 0.976, -1.608, -0.455, -0.316, -1.147, -1.147, 1.022, 0.976, 0.837, -1.054, -0.962, 0.883, 0.653, 1.298, 0.745, -1.193, 0.883, 0.929, -1.562, -0.731, -0.87, -1.47, -1.47, -1.562, 0.653, -0.547, 0.791, -1.654, 0.791, 0.284, -1.285, 0.284, -0.824, 0.745, -0.316, -1.47, -0.593, 1.114, -0.501, -1.47, 1.068, 0.745, -0.224, 0.284, 1.068, 0.237, -1.47, -1.331, 0.791, 0.837, -1.1, 0.883, 0.33, -1.516, -1.423, -0.27, 0.237, 0.791, -0.87, 0.468, -1.516, 0.883, -0.824, 0.791, -1.331, 1.068, 1.391, 0.699, -0.916, 0.099, 1.022, 0.929, 0.653, 0.56, 0.929, 0.56, -0.132, 0.745, 1.345, 0.976, -0.362, -1.377, -1.193, -1.47, 0.699, 1.16, -1.193, 1.298, -1.516, -0.039, -0.178, -1.562, 1.252, -0.27, -1.47, 0.237, 0.33, 0.422, 0.837, 0.145, 1.114, -1.516, 0.653, -0.731, -1.562, 1.068, 1.068, 0.976, 0.837, -0.962, 1.114, -0.316, -1.562, 0.33, -0.962, 0.514, -1.47, -0.132, 0.468, 0.791, 0.376, 0.422, 1.391, 0.745, -1.331, 1.898, -1.193, -1.654, 0.422, 0.745, -1.377, -1.423, -0.639, -1.608, 0.653, 0.699, -1.562, -1.054, -0.87, 0.007, 1.483, 0.883, -1.1, -1.47, 0.653, 0.929, 1.206, -0.962, -1.331, -1.654, -1.331, 1.252, 1.206, -0.87, -1.239, -0.178, 0.376, -1.654, 0.745, 0.883, -0.916, -0.316, 0.929, 0.929, -1.608, 0.837, 0.745, 0.929, 0.976, -0.778, -1.147, 1.16, 0.653, -1.1, 0.653, -1.377, -1.285, 0.653, 1.16, -0.501, 0.053, 1.714, -0.87, 0.56, -1.239, -1.423, -1.054, -0.639, -1.608, 0.791, 0.976, 0.883, 0.468, -1.516, 0.883, 1.252, 0.237, -0.408, 0.699, -1.377, 0.237, 0.929, 1.114, 0.376, 1.252, 0.237, 0.56, -0.824, -1.423, -0.178, 0.376, 0.929, -1.608, -1.285, 0.883, 1.16, 0.468, 1.114, 0.468, 0.653, -0.87, 1.714, 0.606, -1.377, -0.316, -1.193, -0.87, -1.516, -0.178, 0.376, 0.237, 1.206, -0.685, -0.778, -1.147, 0.883, 0.791, 0.053, 0.699, 0.514, -1.193, 0.376, 1.298, 
-0.962, 0.284, 1.114, 0.791, -1.654, 1.391, 1.114, 1.391, 1.76, 0.606, 0.33, -1.47, -0.824, 0.56, 0.791, 0.929, 0.33, 0.237, 1.345, -1.423, 0.606, 1.022, 0.606, 0.883, -1.285, 0.883, 1.252, -1.193, 0.883, -1.193, 1.483, 0.883, 0.699, 0.284, -0.639, -1.377, 0.33, -0.455, 0.929, -0.547, 0.099, -1.654, 0.699, -1.562, -1.377, 0.883, -1.193, 1.114, 0.837, 1.298, 1.022, 0.929, 0.745, 0.653, 0.883, 0.56, 0.653, -0.962, 0.837, -1.562, 0.837, -1.285, 0.976, -1.147, -1.423, -1.239, -1.331, 0.791, 0.883, -0.547, 0.007, -1.423, -0.962, 0.699, -1.377, 0.837, 0.422, -0.962, 0.099, -1.47, -1.008, 1.298, -1.054, 0.791, 0.699, -0.962, -1.608, -1.193, 2.083, -0.132, 0.56, 0.053, 0.699, 0.606, -1.008, -0.039, -1.239, -1.1, 0.791, 0.699, 1.022, 0.191, -0.593, -1.516, -1.423)
fifaRaw <- c(fifaRaw, 1.391, 0.053, 1.022, 0.284, 0.56, 0.745, -0.156, -0.523, -0.418, 2.156, -0.261, 3.364, -0.628, -0.576, -0.156, -0.523, -0.628, -0.471, -0.366, -0.261, -0.313, -0.576, -0.156, -0.576, -0.471, -0.208, 1.998, -0.366, 2.366, -0.418, -0.366, 2.313, -0.208, -0.628, -0.471, -0.313, -0.471, -0.523, -0.261, -0.156, 3.206, -0.471, -0.366, -0.366, -0.208, -0.471, -0.313, -0.261, 2.523, -0.313, -0.471, -0.313, -0.576, -0.523, 2.628, -0.681, -0.156, -0.523, -0.366, 2.996, -0.418, 2.838, -0.261, -0.523, 1.893, -0.471, -0.366, -0.156, -0.418, -0.313, -0.208, -0.681, -0.628, -0.313, -0.366, -0.156, -0.628, -0.418, -0.208, -0.418, -0.156, 2.733, -0.628, -0.156, -0.208, -0.156, -0.418, -0.576, -0.576, -0.313, 3.101, -0.103, -0.103, -0.261, -0.261, -0.366, -0.576, -0.261, -0.261, -0.366, -0.156, -0.261, -0.418, -0.418, -0.261, -0.261, -0.418, -0.628, -0.208, -0.208, -0.208, -0.156, -0.576, -0.261, -0.366, -0.576, 2.051, -0.156, -0.576, 1.946, -0.261, -0.418, -0.523, -0.261, -0.523, -0.156, -0.208, -0.471, 2.628, -0.471, 2.523, -0.523, -0.103, -0.576, -0.156, 2.313, -0.523, -0.681, -0.523, -0.471, -0.261, -0.103, -0.208, 2.103, -0.366, -0.208, -0.628, -0.471, -0.313, -0.103, -0.261, -0.261, -0.628, 3.101, -0.471, -0.366, -0.313, -0.576, -0.156, -0.313, -0.576, -0.156, -0.261, -0.103, -0.523, -0.208, -0.576, -0.628, -0.523, -0.366, 2.681, -0.471, -0.576, 2.733, 2.628, 2.523, -0.576, -0.366, -0.261, 2.418, -0.418, -0.418, -0.628, -0.523, -0.418, -0.156, -0.418, 1.42, -0.628, -0.366, -0.156, -0.208, -0.628, -0.103, -0.261, -0.156, -0.576, -0.208, 1.525, 2.366, -0.628, -0.418, -0.576, -0.208, -0.261, 2.576, 2.733, -0.628, -0.628, -0.208, -0.471, -0.681, -0.576, -0.366, -0.103, -0.523, -0.208, -0.628, -0.576, -0.261, -0.208, -0.523, -0.261, -0.313, -0.366, -0.313, -0.471, -0.208, -0.576, -0.576, -0.471, -0.418, -0.576, -0.313, -0.261, 2.261, -0.156, -0.628, -0.418, -0.313, 2.733, -0.576, -0.576, 2.313, -0.471, -0.576, 2.576, -0.418, -0.208, -0.576, -0.103, -0.208, 
-0.261, -0.208, -0.366, -0.156, -0.523, -0.208, -0.261, -0.208, -0.576, -0.261, -0.628, -0.208, 1.735, -0.471, -0.208, -0.628, 2.418, -0.523, -0.471, -0.418, -0.418, -0.103, -0.523, -0.418, -0.523, -0.471, -0.156, 3.154, -0.208, -0.576, -0.261, -0.418, -0.208, 2.103, -0.418, -0.471, 2.523, -0.576, -0.261, -0.523, -0.523, -0.523, -0.103, 2.313, -0.313, -0.523, -0.681, -0.523, 2.208, 2.366, -0.628, -0.576, -0.471, -0.366, 1.893, -0.261, -0.628, 2.523, -0.208, -0.156, -0.208, -0.366, -0.523, -0.628, -0.313, -0.208, -0.523, -0.208, -0.471, -0.576, -0.313, -0.261, -0.576, -0.313, -0.523, -0.156, 3.049, -0.261, -0.261, -0.576, -0.418, -0.313, -0.523, -0.471, -0.418, 2.628, -0.628, -0.471, 2.681, -0.523, -0.628, -0.418, -0.208, 2.156, -0.366, -0.261, -0.261, -0.471, -0.471, -0.313, -0.523, -0.471, -0.208, -0.208, -0.576, -0.156, -0.418, -0.523)
fifaRaw <- c(fifaRaw, 2.261, -0.366, -0.261, -0.103, 2.576, -0.103, -0.628, -0.418, -0.208, -0.208, -0.418, -0.471, -0.523, -0.261, -0.471, -0.418, -0.681, 2.786, -0.523, 2.471, -0.366, -0.628, -0.208, -0.471, -0.366, -0.261, -0.156, -0.208, -0.366, -0.681, -0.208, -0.471, 2.838, -0.523, -0.523, -0.208, -0.156, -0.366, -0.103, 2.156, -0.208, -0.208, -0.366, -0.156, -0.628, -0.523, 1.998, -0.576, -0.681, -0.418, -0.366, -0.418, -0.103, -0.103, 2.944, -0.576, -0.471, -0.366, -0.208, -0.366, -0.628, -0.681, -0.208, -0.628, -0.523, -0.208, -0.471, -0.628, -0.366, -0.418, 1.998, -0.523, -0.628, -0.681, -0.681, -0.261, 2.051, -0.208, 2.628, -0.418, -0.681, -0.471, -0.156, -0.208, -0.103, -0.418, -0.576, -0.261, -0.576, -0.576, -0.366, -0.523, -0.366, -0.156, 2.523, -0.471, -0.313, -0.313, -0.313, 3.101, 2.733, -0.208, -0.156, -0.523, -0.156, -0.471, -0.156, -0.366, -0.471, -0.576, -0.576, -0.628, -0.208, -0.576, 2.838, -0.681, -0.313, -0.366, -0.103, -0.208, 2.261, 2.681, 2.733, -0.366, -0.628, -0.261, -0.313, -0.313, -0.313, -0.471, -0.103, -0.313, 0.002, -0.313, -0.471, -0.156, -0.418, -0.156, -0.313, 2.681, -0.156, -0.523, -0.576, -0.418, -0.313, -0.523, -0.485, -0.323, -0.216, 1.987, -0.592, 3.115, -0.216, -0.538, -0.108, -0.699, -0.377, -0.538, -0.108, -0.485, -0.162, -0.216, -0.485, -0.323, -0.592, -0.162, 2.095, -0.538, 2.363, -0.431, -0.377, 2.256, -0.216, -0.377, -0.377, -0.27, -0.27, -0.646, -0.162, -0.162, 3.008, -0.485, -0.431, -0.162, -0.538, -0.485, -0.699, -0.323, 2.632, -0.699, -0.323, -0.592, -0.538, -0.216, 2.686, -0.699, -0.323, -0.592, -0.538, 2.739, -0.27, 2.793, -0.323, -0.699, 2.095, -0.162, -0.485, -0.377, -0.323, -0.377, -0.592, -0.323, -0.485, -0.162, -0.485, -0.592, -0.646, -0.377, -0.323, -0.377, -0.27, 2.578, -0.431, -0.162, -0.108, -0.485, -0.108, -0.592, -0.485, -0.538, 3.008, -0.162, -0.216, -0.431, -0.646, -0.108, -0.485, -0.431, -0.27, -0.431, -0.377, -0.216, -0.646, -0.162, -0.485, -0.431, -0.538, -0.431, -0.646, -0.485, -0.538, -0.377, 
-0.162, -0.162, -0.216, -0.162, 1.665, -0.216, -0.27, 1.826, -0.485, -0.216, -0.538, -0.538, -0.323, -0.162, -0.592, -0.108, 2.9, -0.216, 2.739, -0.323, -0.162, -0.216, -0.27, 2.471, -0.485, -0.162, -0.592, -0.592, -0.323, -0.485, -0.216, 1.88, -0.323, -0.216, -0.377, -0.323, -0.431, -0.162, -0.431, -0.216, -0.216, 2.954, -0.538, -0.216, -0.216, -0.377, -0.377, -0.27, -0.377, -0.27, -0.377, -0.216, -0.699, -0.646, -0.216, -0.485, -0.162, -0.431, 2.686, -0.431, -0.485, 2.847, 2.632, 2.847, -0.162, -0.323, -0.699, 2.202, -0.162, -0.592, -0.323, -0.377, -0.162, -0.216, -0.431, 2.202, -0.592, -0.27, -0.377, -0.216, -0.431, -0.27, -0.377, -0.162, -0.646, -0.27, 1.933, 2.578, -0.485, -0.323, -0.27, -0.216, -0.216, 2.471)
fifaRaw <- c(fifaRaw, 2.847, -0.646, -0.377, -0.377, -0.538, -0.377, -0.592, -0.216, -0.431, -0.27, -0.646, -0.538, -0.27, -0.108, -0.538, -0.27, -0.323, -0.538, -0.323, -0.485, -0.431, -0.592, -0.538, -0.485, -0.108, -0.216, -0.485, -0.592, -0.27, 2.256, -0.377, -0.323, -0.377, -0.323, 2.9, -0.538, -0.592, 2.202, -0.27, -0.538, 2.632, -0.27, -0.538, -0.27, -0.485, -0.377, -0.431, -0.27, -0.538, -0.216, -0.592, -0.162, -0.27, -0.538, -0.108, -0.216, -0.162, -0.162, 1.718, -0.162, -0.27, -0.162, 2.417, -0.431, -0.485, -0.485, -0.216, -0.323, -0.27, -0.592, -0.323, -0.377, -0.377, 2.632, -0.538, -0.646, -0.485, -0.485, -0.485, 2.686, -0.162, -0.108, 2.686, -0.646, -0.323, -0.377, -0.485, -0.592, -0.323, 2.309, -0.323, -0.323, -0.592, -0.431, 2.202, 2.202, -0.485, -0.377, -0.377, -0.216, 2.363, -0.377, -0.485, 2.148, -0.485, -0.323, -0.592, -0.162, -0.27, -0.485, -0.377, -0.646, -0.538, -0.377, -0.646, -0.216, -0.646, -0.592, -0.538, -0.323, -0.108, -0.431, 2.847, -0.592, -0.162, -0.485, -0.27, -0.108, -0.377, -0.431, -0.377, 3.115, -0.592, -0.323, 2.686, -0.646, -0.162, -0.538, -0.323, 1.826, -0.216, -0.592, -0.323, -0.431, -0.485, -0.162, -0.431, -0.108, -0.485, -0.216, -0.27, -0.538, -0.162, -0.377, 2.417, -0.592, -0.431, -0.27, 2.524, -0.538, -0.216, -0.27, -0.699, -0.646, -0.323, -0.431, -0.323, -0.377, -0.162, -0.216, -0.323, 2.524, -0.431, 2.578, -0.592, -0.431, -0.216, -0.108, -0.162, -0.485, -0.592, -0.216, -0.431, -0.377, -0.592, -0.377, 2.847, -0.592, -0.592, -0.538, -0.646, -0.323, -0.431, 2.471, -0.27, -0.377, -0.699, -0.323, -0.538, -0.592, 1.826, -0.699, -0.323, -0.592, -0.538, -0.485, -0.377, -0.323, 2.632, -0.592, -0.485, -0.646, -0.377, -0.485, -0.377, -0.538, -0.162, -0.646, -0.538, -0.27, -0.538, -0.592, -0.108, -0.216, 2.041, -0.162, -0.27, -0.377, -0.538, -0.108, 2.095, -0.592, 2.739, -0.646, -0.162, -0.323, -0.431, -0.27, -0.108, -0.108, -0.323, -0.377, -0.431, -0.377, -0.538, -0.538, -0.646, -0.592, 2.739, -0.108, -0.216, -0.108, -0.323, 3.115, 
2.739, -0.377, -0.323, -0.216, -0.431, -0.431, -0.162, -0.646, -0.162, -0.646, -0.431, -0.431, -0.699, -0.431, 2.686, -0.162, -0.162, -0.646, -0.162, -0.108, 2.524, 2.9, 1.826, -0.538, -0.216, -0.485, -0.592, -0.162, -0.485, -0.377, -0.162, -0.592, -0.108, -0.431, -0.216, -0.699, -0.485, -0.27, -0.485, 2.471, -0.431, -0.485, -0.323, -0.538, -0.377, -0.162, -0.525, -0.25, -0.25, 1.999, -0.36, 2.328, -0.525, -0.47, -0.47, -0.25, -0.47, -0.305, -0.36, -0.579, -0.195, -0.579, -0.14, -0.14, -0.579, -0.305, 1.999, -0.525, 2.493, -0.305, -0.634, 2.219, -0.525, -0.525, -0.086, -0.525, -0.305, -0.47, -0.415, -0.525, 2.822, -0.47, -0.415, -0.14, -0.086, -0.36, -0.36, -0.415, 2.658, -0.634, -0.634, -0.525, -0.305, -0.086, 2.658, -0.579, -0.195, -0.086, -0.47)
fifaRaw <- c(fifaRaw, 3.371, -0.579, 3.042, -0.086, -0.36, 2.054, -0.36, -0.525, -0.25, -0.25, -0.634, -0.579, -0.634, -0.305, -0.14, -0.47, -0.415, -0.525, -0.415, -0.634, -0.25, -0.525, 3.316, -0.36, -0.25, -0.25, -0.525, -0.195, -0.47, -0.579, -0.579, 2.767, -0.47, -0.25, -0.525, -0.47, -0.525, -0.579, -0.634, -0.525, -0.086, -0.14, -0.579, -0.47, -0.14, -0.36, -0.579, -0.579, -0.525, -0.579, -0.195, -0.305, -0.305, -0.195, -0.415, -0.525, -0.47, 2.548, -0.689, -0.36, 1.999, -0.525, -0.579, -0.634, -0.47, -0.47, -0.47, -0.579, -0.14, 2.383, -0.579, 2.658, -0.305, -0.25, -0.25, -0.14, 2.822, -0.305, -0.14, -0.47, -0.579, -0.36, -0.25, -0.195, 1.89, -0.525, -0.47, -0.47, -0.25, -0.47, -0.305, -0.305, -0.195, -0.415, 3.206, -0.086, -0.634, -0.25, -0.525, -0.086, -0.579, -0.634, -0.47, -0.415, -0.25, -0.634, -0.14, -0.36, -0.195, -0.525, -0.305, 3.206, -0.305, -0.305, 2.822, 2.328, 2.767, -0.086, -0.25, -0.195, 2.274, -0.525, -0.195, -0.525, -0.579, -0.415, -0.25, -0.195, 1.56, -0.634, -0.415, -0.36, -0.525, -0.14, -0.415, -0.47, -0.195, -0.579, -0.305, 2.219, 2.548, -0.579, -0.195, -0.305, -0.415, -0.47, 2.822, 3.097, -0.195, -0.195, -0.305, -0.086, -0.525, -0.579, -0.47, -0.634, -0.36, -0.689, -0.525, -0.634, -0.36, -0.525, -0.525, -0.47, -0.305, -0.525, -0.305, -0.415, -0.415, -0.195, -0.086, -0.525, -0.305, -0.525, -0.305, -0.525, 1.89, -0.415, -0.47, -0.195, -0.36, 2.658, -0.415, -0.579, 2.219, -0.47, -0.195, 2.274, -0.36, -0.195, -0.086, -0.36, -0.195, -0.195, -0.14, -0.25, -0.579, -0.195, -0.525, -0.36, -0.47, -0.634, -0.47, -0.415, -0.36, 1.944, -0.086, -0.634, -0.25, 2.328, -0.525, -0.305, -0.47, -0.25, -0.525, -0.47, -0.305, -0.36, -0.579, -0.579, 2.767, -0.305, -0.195, -0.525, -0.579, -0.634, 2.438, -0.25, -0.634, 2.438, -0.36, -0.415, -0.195, -0.689, -0.14, -0.25, 2.493, -0.25, -0.305, -0.195, -0.25, 2.493, 2.219, -0.36, -0.47, -0.195, -0.25, 2.658, -0.25, -0.25, 1.725, -0.305, -0.14, -0.525, -0.47, -0.36, -0.36, -0.634, -0.525, -0.689, -0.195, -0.579, 
-0.36, -0.525, -0.305, -0.415, -0.579, -0.579, -0.195, 2.603, -0.25, -0.579, -0.415, -0.579, -0.36, -0.086, -0.525, -0.086, 2.164, -0.36, -0.195, 2.603, -0.579, -0.305, -0.36, -0.47, 1.999, -0.14, -0.36, -0.579, -0.47, -0.195, -0.579, -0.689, -0.47, -0.14, -0.634, -0.36, -0.634, -0.579, -0.525, 2.548, -0.36, -0.634, -0.305, 2.713, -0.579, -0.305, -0.305, -0.634, -0.195, -0.25, -0.36, -0.36, -0.47, -0.579, -0.36, -0.25, 2.658, -0.525, 2.713, -0.195, -0.25, -0.525, -0.195, -0.305, -0.195, -0.25, -0.36, -0.14, -0.579, -0.579, -0.634, 2.658, -0.47, -0.415, -0.14, -0.634, -0.195, -0.14, 2.493, -0.415, -0.47, -0.525, -0.25, -0.47, -0.634, 1.725, -0.415, -0.305, -0.415, -0.634, -0.634, -0.25, -0.14, 2.328, -0.195, -0.47, -0.086, -0.14, -0.47, -0.195)
fifaRaw <- c(fifaRaw, -0.689, -0.634, -0.086, -0.305, -0.195, -0.36, -0.305, -0.525, -0.579, 2.219, -0.25, -0.47, -0.579, -0.634, -0.086, 2.109, -0.195, 1.56, -0.36, -0.195, -0.47, -0.195, -0.14, -0.195, -0.086, -0.47, -0.36, -0.525, -0.14, -0.25, -0.36, -0.579, -0.579, 2.603, -0.47, -0.47, -0.579, -0.525, 2.767, 3.261, -0.36, -0.525, -0.305, -0.305, -0.47, -0.195, -0.25, -0.634, -0.47, -0.305, -0.305, -0.525, -0.305, 2.438, -0.634, -0.305, -0.579, -0.086, 0.683, 2.438, 2.603, 2.713, -0.47, -0.525, -0.25, -0.195, -0.634, -0.634, -0.305, -0.195, -0.47, -0.25, -0.415, -0.14, -0.305, -0.579, -0.25, -0.36, 2.713, -0.14, -0.305, -0.195, -0.47, -0.305, -0.25, -0.567, -0.356, -0.144, 2.074, -0.408, 3.078, -0.144, -0.62, -0.197, -0.408, -0.461, -0.356, -0.514, -0.408, -0.514, -0.461, -0.197, -0.461, -0.197, -0.144, 2.022, -0.197, 2.391, -0.303, -0.514, 2.339, -0.567, -0.461, -0.62, -0.408, -0.197, -0.25, -0.144, -0.25, 3.237, -0.303, -0.567, -0.303, -0.408, -0.514, -0.197, -0.25, 2.603, -0.25, -0.356, -0.408, -0.567, -0.197, 2.708, -0.62, -0.144, -0.514, -0.356, 2.761, -0.356, 2.708, -0.567, -0.514, 2.497, -0.567, -0.091, -0.356, -0.567, -0.144, -0.197, -0.144, -0.197, -0.408, -0.62, -0.303, -0.197, -0.144, -0.567, -0.567, -0.514, 2.708, -0.408, -0.356, -0.62, -0.356, -0.567, -0.567, -0.356, -0.408, 2.92, -0.091, -0.197, -0.356, -0.408, -0.461, -0.25, -0.514, -0.303, -0.144, -0.144, -0.408, -0.673, -0.25, -0.303, -0.303, -0.408, -0.567, -0.25, -0.25, -0.197, -0.197, -0.303, -0.514, -0.514, -0.197, 2.286, -0.25, -0.62, 1.81, -0.567, -0.514, -0.356, -0.303, -0.25, -0.25, -0.091, -0.356, 2.603, -0.567, 2.814, -0.25, -0.567, -0.356, -0.514, 2.444, -0.197, -0.303, -0.567, -0.514, -0.567, -0.25, -0.356, 2.603, -0.356, -0.303, -0.408, -0.197, -0.673, -0.25, -0.62, -0.303, -0.25, 3.025, -0.303, -0.62, -0.461, -0.461, -0.514, -0.567, -0.514, -0.461, -0.514, -0.356, -0.514, -0.514, -0.144, -0.567, -0.303, -0.144, 2.497, -0.303, -0.461, 2.603, 2.656, 2.814, -0.091, -0.303, -0.567, 
2.074, -0.514, -0.091, -0.461, -0.356, -0.197, -0.303, -0.567, 1.388, -0.673, -0.144, -0.461, -0.567, -0.25, -0.567, -0.461, -0.144, -0.567, -0.197, 1.757, 2.444, -0.303, -0.461, -0.356, -0.197, -0.62, 2.391, 2.867, -0.25, -0.25, -0.144, -0.62, -0.62, -0.25, -0.197, -0.197, -0.514, -0.303, -0.673, -0.461, -0.197, -0.408, -0.514, -0.673, -0.461, -0.303, -0.408, -0.408, -0.408, -0.567, -0.303, -0.514, -0.197, -0.144, -0.197, -0.62, 2.127, -0.461, -0.567, -0.303, -0.144, 2.814, -0.461, -0.356, 2.127, -0.62, -0.408, 2.286, -0.461, -0.62, -0.461, -0.408, -0.144, -0.408, -0.303, -0.25, -0.25, -0.197, -0.197, -0.303, -0.144, -0.408, -0.673, -0.461, -0.197, 1.599, -0.461, -0.62, -0.303, 2.55, -0.197, -0.514, -0.303, -0.514, -0.62, -0.62, -0.62, -0.673)
fifaRaw <- c(fifaRaw, -0.408, -0.567, 2.603, -0.197, -0.144, -0.356, -0.25, -0.62, 3.025, -0.62, -0.567, 2.708, -0.62, -0.144, -0.197, -0.567, -0.25, -0.197, 2.708, -0.197, -0.408, -0.673, -0.303, 2.391, 2.233, -0.197, -0.303, -0.461, -0.62, 2.497, -0.303, -0.408, 2.444, -0.408, -0.567, -0.25, -0.197, -0.303, -0.25, -0.408, -0.514, -0.408, -0.197, -0.356, -0.408, -0.408, -0.25, -0.62, -0.567, -0.567, -0.25, 2.761, -0.461, -0.197, -0.514, -0.356, -0.25, -0.461, -0.303, -0.408, 2.814, -0.303, -0.673, 2.55, -0.356, -0.356, -0.144, -0.356, 1.863, -0.091, -0.567, -0.514, -0.567, -0.303, -0.514, -0.197, -0.144, -0.408, -0.514, -0.62, -0.303, -0.62, -0.144, 2.339, -0.62, -0.62, -0.408, 3.025, -0.567, -0.197, -0.514, -0.303, -0.567, -0.356, -0.25, -0.461, -0.356, -0.62, -0.356, -0.408, 2.603, -0.62, 2.391, -0.356, -0.461, -0.144, -0.356, -0.673, -0.303, -0.461, -0.303, -0.567, -0.303, -0.25, -0.673, 2.761, -0.62, -0.408, -0.303, -0.62, -0.461, -0.567, 2.127, -0.303, -0.25, -0.673, -0.197, -0.408, -0.144, 1.652, -0.514, -0.567, -0.303, -0.303, -0.514, -0.408, -0.197, 2.444, -0.197, -0.144, -0.303, -0.461, -0.303, -0.144, -0.25, -0.62, -0.408, -0.197, -0.197, -0.144, -0.567, -0.197, -0.567, 2.074, -0.303, -0.144, -0.356, -0.25, -0.303, 1.863, -0.567, 2.761, -0.62, -0.62, -0.25, -0.356, -0.514, -0.514, -0.197, -0.197, -0.567, -0.567, -0.461, -0.461, -0.408, -0.408, -0.408, 2.656, -0.356, -0.567, -0.567, -0.356, 3.237, 2.761, -0.567, -0.303, -0.461, -0.303, -0.356, -0.144, -0.144, -0.461, -0.356, -0.567, -0.62, -0.514, -0.356, 2.814, -0.356, -0.356, -0.25, -0.567, -0.197, 2.286, 2.603, 2.603, -0.567, -0.461, -0.356, -0.461, -0.303, -0.091, -0.514, -0.514, -0.144, -0.144, -0.197, -0.197, -0.567, -0.567, -0.197, -0.356, 2.339, -0.461, -0.144, -0.408, -0.197, -0.461, -0.567, -0.36, -0.308, -0.36, 2.239, -0.672, 3.487, -0.204, -0.308, -0.62, -0.412, -0.568, -0.256, -0.204, -0.204, -0.464, -0.412, -0.568, -0.36, -0.1, -0.412, 2.135, -0.62, 2.447, -0.152, -0.308, 2.447, -0.62, 
-0.308, -0.256, -0.412, -0.464, -0.412, -0.516, -0.204, 3.435, -0.308, -0.516, -0.256, -0.36, -0.464, -0.256, -0.464, 2.551, -0.568, -0.412, -0.568, -0.204, -0.152, 2.603, -0.308, -0.62, -0.204, -0.204, 3.123, -0.62, 2.915, -0.1, -0.36, 1.771, -0.256, -0.36, -0.256, -0.464, -0.36, -0.568, -0.412, -0.36, -0.568, -0.568, -0.412, -0.204, -0.464, -0.204, -0.412, -0.204, 2.811, -0.412, -0.516, -0.36, -0.464, -0.308, -0.152, -0.308, -0.204, 3.019, -0.464, -0.152, -0.256, -0.412, -0.412, -0.204, -0.568, -0.308, -0.256, -0.568, -0.204, -0.308, -0.36, -0.568, -0.464, -0.308, -0.516, -0.308, -0.568, -0.256, -0.204, -0.516, -0.204, -0.516, -0.1, 2.083, -0.568, -0.568, 2.187, -0.568, -0.568, -0.256, -0.62, -0.204, -0.36, -0.256, -0.256, 2.759, -0.152, 2.499, -0.62, -0.204, -0.62)
fifaRaw <- c(fifaRaw, -0.36, 2.499, -0.568, -0.412, -0.412, -0.204, -0.308, -0.256, -0.256, 2.187, -0.412, -0.62, -0.672, -0.204, -0.568, -0.464, -0.204, -0.256, -0.568, 3.071, -0.256, -0.256, -0.516, -0.308, -0.152, -0.1, -0.516, -0.308, -0.568, -0.308, -0.568, -0.308, -0.36, -0.516, -0.1, -0.412, 2.707, -0.256, -0.412, 2.967, 2.655, 2.499, -0.308, -0.568, -0.256, 2.499, -0.308, -0.152, -0.516, -0.256, -0.36, -0.204, -0.672, 1.668, -0.516, -0.152, -0.1, -0.36, -0.256, -0.62, -0.464, -0.568, -0.516, -0.308, 1.875, 2.395, -0.204, -0.568, -0.516, -0.464, -0.308, 2.551, 2.759, -0.516, -0.36, -0.62, -0.204, -0.204, -0.464, -0.308, -0.568, -0.516, -0.36, -0.412, -0.152, -0.464, -0.412, -0.36, -0.464, -0.412, -0.256, -0.412, -0.204, -0.256, -0.36, -0.308, -0.412, -0.204, -0.464, -0.36, -0.204, 1.771, -0.464, -0.62, -0.568, -0.36, 2.603, -0.464, -0.256, 2.135, -0.412, -0.204, 2.395, -0.62, -0.62, -0.516, -0.36, -0.204, -0.308, -0.464, -0.308, -0.62, -0.568, -0.204, -0.568, -0.464, -0.412, -0.62, -0.568, -0.204, 1.46, -0.412, -0.204, -0.36, 2.343, -0.412, -0.464, -0.36, -0.36, -0.256, -0.516, -0.308, -0.204, -0.152, -0.204, 3.071, -0.1, -0.308, -0.516, -0.256, -0.1, 1.979, -0.152, -0.152, 2.499, -0.36, -0.36, -0.412, -0.36, -0.204, -0.1, 2.291, -0.308, -0.152, -0.308, -0.36, 2.187, 2.395, -0.464, -0.464, -0.308, -0.256, 2.343, -0.412, -0.568, 2.343, -0.516, -0.568, -0.256, -0.308, -0.412, -0.516, -0.62, -0.464, -0.308, -0.568, -0.464, -0.152, -0.464, -0.62, -0.464, -0.464, -0.204, -0.672, 3.019, -0.62, -0.204, -0.256, -0.568, -0.36, -0.412, -0.308, -0.412, 2.655, -0.464, -0.672, 2.863, -0.568, -0.308, -0.412, -0.204, 1.927, -0.1, -0.36, -0.62, -0.516, -0.568, -0.256, -0.516, -0.516, -0.152, -0.62, -0.568, -0.204, -0.464, -0.152, 2.447, -0.36, -0.36, -0.36, 2.655, -0.412, -0.308, -0.412, -0.204, -0.36, -0.412, -0.412, -0.412, -0.568, -0.516, -0.256, -0.62, 2.759, -0.516, 2.603, -0.36, -0.36, -0.204, -0.152, -0.464, -0.204, -0.1, -0.308, -0.412, -0.308, -0.36, -0.516, 2.915, 
-0.62, -0.204, -0.412, -0.62, -0.308, -0.152, 2.187, -0.568, -0.36, -0.308, -0.308, -0.256, -0.464, 1.875, -0.516, -0.516, -0.62, -0.568, -0.36, -0.36, -0.1, 2.811, -0.672, -0.204, -0.568, -0.412, -0.516, -0.464, -0.308, -0.36, -0.308, -0.36, -0.36, -0.204, -0.36, -0.62, -0.516, 1.927, -0.256, -0.62, -0.62, -0.62, -0.568, 2.187, -0.568, 2.603, -0.62, -0.568, -0.308, -0.62, -0.516, -0.152, -0.256, -0.36, -0.1, -0.464, -0.256, -0.308, -0.568, -0.308, -0.516, 2.759, -0.36, -0.412, -0.568, -0.412, 3.123, 2.759, -0.308, -0.36, -0.152, -0.204, -0.62, -0.516, -0.204, -0.152, -0.152, -0.516, -0.62, -0.516, -0.256, 2.863, -0.412, -0.412, -0.36, -0.412, -0.204, 2.395, 2.135, 2.499, -0.36, -0.464, -0.204, -0.308, -0.568, -0.412, -0.256, -0.516, -0.256)
fifaRaw <- c(fifaRaw, -0.152, -0.568, -0.672, -0.62, -0.516, -0.308, -0.568, 1.823, -0.36, -0.516, -0.36, -0.464, -0.568, -0.256, -0.077, -0.31, -0.205, -0.373, -0.371, 1.509, -0.278, -0.36, -0.282, -0.251, -0.378, -0.26, -0.319, -0.273, 1.145, -0.382, -0.346, 2.421, -0.205, -0.389, -0.373, -0.328, -0.328, -0.395, -0.31, -0.367, -0.282, -0.278, -0.305, 0.47, -0.389, -0.375, -0.333, -0.314, 1.236, -0.356, -0.333, -0.077, -0.223, 0.123, 0.871, -0.205, -0.292, -0.205, 0.214, 0.871, 0.178, -0.241, -0.384, -0.356, 1.418, -0.26, -0.31, 0.251, -0.319, -0.041, 0.506, -0.333, -0.384, -0.187, -0.296, 0.78, -0.205, -0.187, -0.342, -0.187, -0.342, -0.364, -0.31, -0.241, -0.356, -0.31, -0.287, 13.545, -0.241, -0.187, -0.132, -0.346, -0.278, 1.145, 0.014, -0.305, -0.282, -0.342, -0.323, -0.319, -0.391, -0.373, -0.364, -0.114, 0.397, -0.314, -0.255, -0.282, -0.205, -0.205, -0.328, 0.78, -0.273, -0.278, -0.241, -0.132, -0.223, -0.382, -0.278, -0.362, -0.382, -0.292, -0.223, 0.324, -0.376, -0.041, 0.689, -0.378, -0.301, 0.689, -0.333, -0.373, -0.305, -0.187, -0.351, -0.187, -0.077, 0.251, -0.367, -0.391, 0.087, 1.236, 0.397, -0.342, -0.314, -0.369, -0.333, -0.333, -0.251, -0.346, -0.346, -0.369, -0.391, -0.255, 0.16, -0.228, 5.339, 0.36, -0.354, -0.077, -0.023, 0.306, -0.004, -0.301, -0.356, -0.296, -0.273, -0.077, 0.506, -0.205, -0.287, 0.251, -0.187, 0.214, -0.237, -0.168, -0.337, -0.282, -0.278, -0.223, -0.333, 0.196, -0.323, -0.328, -0.373, 0.269, -0.351, -0.354, -0.31, -0.337, -0.187, -0.319, -0.346, -0.328, -0.346, -0.4, -0.296, -0.059, -0.31, -0.168, -0.077, -0.323, -0.382, -0.375, -0.168, -0.114, -0.395, -0.31, -0.384, -0.292, -0.187, -0.296, 0.488, -0.232, -0.132, -0.387, -0.319, -0.273, -0.333, -0.36, -0.114, -0.273, 0.36, 0.415, -0.187, -0.305, 0.251, -0.385, 1.509, -0.319, -0.223, -0.223, -0.346, -0.387, -0.15, -0.382, -0.319, -0.391, 0.78, -0.205, -0.205, -0.301, -0.278, -0.376, -0.342, -0.305, -0.264, 1.418, -0.041, -0.187, 0.36, -0.358, 0.16, -0.319, -0.301, -0.31, 
0.597, -0.358, -0.041, -0.319, -0.278, -0.38, -0.387, -0.323, 0.178, -0.041, -0.205, -0.023, -0.292, -0.337, 0.397, -0.251, -0.395, -0.387, -0.38, -0.378, -0.282, -0.387, -0.228, -0.278, -0.228, -0.246, 2.695, -0.269, -0.364, 5.795, 1.236, 0.306, -0.333, -0.36, -0.384, -0.237, 1.053, -0.395, -0.342, 0.178, -0.323, 0.379, 1.6, -0.333, 1.236, -0.077, 0.597, -0.333, -0.168, -0.241, 0.78, -0.367, -0.393, -0.328, -0.387, 0.506, 0.506, -0.342, -0.382, 6.251, -0.337, -0.323, -0.278, -0.269, 0.324, 4.609, -0.26, -0.296, -0.132, -0.246, -0.273, 0.597, -0.205, -0.351, -0.391, -0.041, -0.223, -0.36, -0.351, 0.105, 0.269, -0.296, 0.488, 2.695, -0.385, 1.783, -0.205, -0.376)
fifaRaw <- c(fifaRaw, -0.287, -0.223, -0.337, -0.333, -0.096, -0.269, -0.353, -0.246, -0.305, -0.384, -0.356, 0.78, -0.246, 0.142, -0.26, -0.205, -0.384, -0.228, 0.196, -0.31, 1.145, -0.38, -0.205, 0.324, -0.323, -0.387, -0.319, -0.241, -0.36, -0.393, 0.214, 0.032, -0.367, -0.223, -0.15, -0.353, 0.287, 1.874, -0.31, -0.26, -0.187, -0.36, -0.351, -0.251, -0.241, -0.376, -0.337, 0.196, -0.059, -0.38, -0.269, -0.342, -0.237, -0.328, -0.273, -0.38, 0.105, -0.168, -0.114, 0.397, -0.389, 0.032, -0.31, -0.38, 1.053, 0.105, -0.077, 0.962, 1.053, -0.389, -0.395, 0.78, -0.369, -0.365, -0.282, -0.264, -0.342, 0.689, -0.223, -0.305, -0.059, -0.395, 0.251, -0.292, -0.096, 0.452, -0.369, -0.255, -0.264, 2.421, -0.314, -0.337, -0.31, -0.168, -0.396, -0.351, -0.023, -0.269, 0.433, -0.362, -0.378, -0.282, -0.36, -0.382, -0.232, -0.278, 0.306, -0.255, 0.689, 0.014, -0.168, -0.328, -0.314, 0.324, -0.365, -0.369, -0.337, 0.597, -0.282, -0.328, -0.287, -0.228, -0.023, 0.871, 0.069, -0.365, -0.354, -0.323, 0.306, -0.396, -0.378, -0.26, -0.273, 0.689, -0.301, -0.393, 0.306, -0.292, -0.132, -0.31, 0.597, -0.369, -0.255, -0.351, -0.358, -0.337, -0.358, 8.895, 0.306, -0.319, 0.178, -0.393, -0.077, -0.31, -0.38, 0.36, 0.105, -0.337, -0.205, -0.337, -0.375, 1.509, -0.205, -0.389, 0.397, -0.389, 0.105, 1.418, -0.328, -0.365)
# Rebuild the scaled FIFA 19 sample: fifaRaw holds 37 columns of 500
# values each, concatenated column after column (hence byrow=FALSE)
fifa19mtx <- matrix(data=fifaRaw, ncol=37, nrow=500, byrow=FALSE)
fifa19_scaled <- as.data.frame(fifa19mtx)
# 36 player-attribute predictors plus the PlayerValue response
names(fifa19_scaled) <- c('Age', 'Potential', 'Crossing', 'Finishing', 'HeadingAccuracy', 'ShortPassing', 'Volleys', 'Dribbling', 'Curve', 'FKAccuracy', 'LongPassing', 'BallControl', 'Acceleration', 'SprintSpeed', 'Agility', 'Reactions', 'Balance', 'ShotPower', 'Jumping', 'Stamina', 'Strength', 'LongShots', 'Aggression', 'Interceptions', 'Positioning', 'Vision', 'Penalties', 'Composure', 'Marking', 'StandingTackle', 'SlidingTackle', 'GKDiving', 'GKHandling', 'GKKicking', 'GKPositioning', 'GKReflexes', 'PlayerValue')
str(fifa19_scaled)
# Glimpse at the dataset
glimpse(fifa19_scaled)
# Baseline OLS coefficients (intercept dropped via [-1]) for comparison
# with the regularised fits below; Interceptions is excluded from the
# formula here
coefs <- data.frame(OLS=as.vector(lm(PlayerValue ~ . -Interceptions, data=fifa19_scaled)$coef[-1]))
coefs
# Ridge regression via caret, trying 8 values of the tuning parameter:
# mdlRidge
mdlRidge <- caret::train(PlayerValue ~ ., data = fifa19_scaled, method = "ridge", tuneLength = 8)
# Plot ridge train object (performance vs. weight decay)
plot(mdlRidge)
# Ridge regression coefficients along the regularisation path
coefRidge <- predict(mdlRidge$finalModel, type='coef', mode='norm')$coefficients
# Last row of the path = least-regularised fit retaining all variables
coefs$RidgeAll <- coefRidge[nrow(coefRidge),]
print(coefs)
# Lasso regression: mdlLasso
mdlLasso <- caret::train(PlayerValue ~ ., data = fifa19_scaled, method = "lasso", tuneLength = 8)
# Plot lasso object
plot(mdlLasso)
# Get coefficients at every step of the path: coefLasso
coefLasso <- predict(mdlLasso$finalModel, type='coef', mode='norm')$coefficients
# Get coefficients for top 5 and all variables; presumably row 6 is the
# step at which 5 variables have entered the model -- verify against the
# fitted path. Outer parentheses auto-print the assigned value.
(coefs$LassoTop5 <- coefLasso[6, ])
(coefs$LassoAll <- coefLasso[nrow(coefLasso), ])
# ElasticNet regression: mdlElasticNet
mdlElasticNet <- caret::train(PlayerValue ~ ., data = fifa19_scaled, method = "enet", tuneLength = 8)
# Plot elastic net object
plot(mdlElasticNet)
# Get elastic net coefficients: coefElasticNet
coefElasticNet <- predict(mdlElasticNet$finalModel, type="coef", mode="norm")$coefficients
# Get coefficients for top 5 and all variables (same row convention as
# the lasso path above)
(coefs$ElasticNetTop5 <- coefElasticNet[6, ])
(coefs$ElasticNetAll <- coefElasticNet[nrow(coefElasticNet), ])
# Fit MLP using nnet: mdlNNet
# set.seed(124)
# mdlNNet <- nnet(Class ~ ., data = pulsar_train, size = 3)
# Calculate train error: train_error
# pred_train <- predict(mdlNNet, pulsar_train, type="class")
# train_cm <- table(pred_train, pulsar_train$Class)
# (train_error <- 1 - sum(diag(train_cm)) / sum(train_cm))
# Calculate test error: test_error
# pred_test <- predict(mdlNNet, pulsar_test, type="class")
# test_cm <- table(pred_test, pulsar_test$Class)
# (test_error <- 1 - sum(diag(test_cm)) / sum(test_cm))
# Fit MLP using nnet: mdlNNet
# set.seed(124)
# mdlNNet <- nnet(Class ~ ., data = pulsar_train, size = 5)
# Calculate train error: train_error
# pred_train <- predict(mdlNNet, pulsar_train, type="class")
# train_cm <- table(pred_train, pulsar_train$Class)
# (train_error <- 1 - sum(diag(train_cm)) / sum(train_cm))
# Calculate test error: test_error
# pred_test <- predict(mdlNNet, pulsar_test, type="class")
# test_cm <- table(pred_test, pulsar_test$Class)
# (test_error <- 1 - sum(diag(test_cm)) / sum(test_cm))
# Create the 5-fold cross validation training control object
# control <- trainControl(method = "cv", number = 5, savePredictions = TRUE, classProbs = TRUE)
# Create the vector of base learners: baseLearners
# baseLearners <- c('rpart', 'glm', 'knn', 'svmRadial')
# Create and summarize the list of base learners: models
# models <- caretList(Class ~ ., data = training, trControl = control, methodList = baseLearners)
# summary(models)
# Classification results in each resample: results
# results <- resamples(models)
# Summarize and print the results in one line
# (results_summary <- summary(results))
# Show the correlation among the base learners' results
# modelCor(results)
# Display a scatter plot matrix of these results
# splom(results)
# Load caretEnsemble
# library(caretEnsemble)
# Set the seed
# set.seed(123)
# Stack the base learners
# stack.glm <- caretStack(models, method="glm", metric="Accuracy", trControl=control)
# Print the stacked model
# stack.glm
# Summarize the performance results for each base learner
# summary(results)
# Evaluate a fitted caret model on held-out data.
#
# Args:
#   trainObject: a caret `train` object supporting class and probability
#     predictions (classProbs must have been enabled during training).
#   testData: data frame with the predictors plus a `Class` factor column
#     whose positive level is "yes".
#
# Returns a length-2 numeric vector: accuracy, then AUC.
evaluateModel <- function(trainObject, testData) {
  # Compute binary yes/no predictions and class probabilities
  model_preds <- predict(trainObject, testData)
  model_probs <- predict(trainObject, testData, type = "prob")
  # Namespace-qualified calls: Metrics is never attached via library() in
  # this document, so the bare names relied on the caller's search path.
  model_acc <- Metrics::accuracy(testData$Class, model_preds)
  # Select the positive-class column by name rather than by position,
  # consistent with get_auc() later in this script.
  model_auc <- Metrics::auc(testData$Class == 'yes', model_probs[, "yes"])
  # Return model accuracy and AUC
  c(model_acc, model_auc)
}
# Evaluate the performance of each individual base learner
# baseLearnerStats <- sapply(X=stack.glm$models, FUN=evaluateModel, testing)
# baseLearnerDF <- data.frame(baseLearnerStats, row.names = c('acc', 'auc'))
# Compute stacked ensemble's accuracy on test data
# stack_preds <- predict(stack.glm, testing)
# stack_acc <- accuracy(testing$Class, stack_preds)
# Compute stacked ensemble's AUC on test data
# stack_preds_probs <- predict(stack.glm, testing, type="prob")
# stack_auc <- auc(testing$Class == 'yes', stack_preds_probs)
# Combine the stacked ensemble results
# (allLearnersDF <- cbind(baseLearnerDF, list(stack=c(stack_acc, stack_auc))))
Chapter 3 - Unsupervised Learning
K-means Clustering:
Clustering Algorithms:
Feature Selection:
Feature Extraction:
Example code includes:
mallData <- c(19, 21, 20, 23, 31, 22, 35, 23, 64, 30, 67, 35, 58, 24, 37, 22, 35, 20, 52, 35, 35, 25, 46, 31, 54, 29, 45, 35, 40, 23, 60, 21, 53, 18, 49, 21, 42, 30, 36, 20, 65, 24, 48, 31, 49, 24, 50, 27, 29, 31, 49, 33, 31, 59, 50, 47, 51, 69, 27, 53, 70, 19, 67, 54, 63, 18, 43, 68, 19, 32, 70, 47, 60, 60, 59, 26, 45, 40, 23, 49, 57, 38, 67, 46, 21, 48, 55, 22, 34, 50, 68, 18, 48, 40, 32, 24, 47, 27, 48, 20, 23, 49, 67, 26, 49, 21, 66, 54, 68, 66, 65, 19, 38, 19, 18, 19, 63, 49, 51, 50, 27, 38, 40, 39, 23, 31, 43, 40, 59, 38, 47, 39, 25, 31, 20, 29, 44, 32, 19, 35, 57, 32, 28, 32, 25, 28, 48, 32, 34, 34, 43, 39, 44, 38, 47, 27, 37, 30, 34, 30, 56, 29, 19, 31, 50, 36, 42, 33, 36, 32, 40, 28, 36, 36, 52, 30, 58, 27, 59, 35, 37, 32, 46, 29, 41, 30, 54, 28, 41, 36, 34, 32, 33, 38, 47, 35, 45, 32, 32, 30, 15000, 15000, 16000, 16000, 17000, 17000, 18000, 18000, 19000, 19000, 19000, 19000, 20000, 20000, 20000, 20000, 21000, 21000, 23000, 23000, 24000, 24000, 25000, 25000, 28000, 28000, 28000, 28000, 29000, 29000, 30000, 30000, 33000, 33000, 33000, 33000, 34000, 34000, 37000, 37000, 38000, 38000, 39000, 39000, 39000, 39000, 40000, 40000, 40000, 40000, 42000, 42000, 43000, 43000, 43000, 43000, 44000, 44000, 46000, 46000, 46000, 46000, 47000, 47000, 48000, 48000, 48000, 48000, 48000, 48000, 49000, 49000, 50000, 50000, 54000, 54000, 54000, 54000, 54000, 54000, 54000, 54000, 54000, 54000, 54000, 54000, 57000, 57000, 58000, 58000, 59000, 59000, 60000, 60000, 60000, 60000, 60000, 60000, 61000, 61000, 62000, 62000, 62000, 62000, 62000, 62000, 63000, 63000, 63000, 63000, 63000, 63000, 64000, 64000, 65000, 65000, 65000, 65000, 67000, 67000, 67000, 67000, 69000, 69000, 70000, 70000, 71000, 71000, 71000, 71000, 71000, 71000, 72000, 72000, 73000, 73000, 73000, 73000, 74000, 74000, 75000, 75000, 76000, 76000, 77000, 77000, 77000, 77000, 78000, 78000, 78000, 78000, 78000, 78000, 78000, 78000, 78000, 78000, 78000, 78000, 79000, 79000, 81000, 81000, 85000, 85000, 86000, 86000, 87000, 
87000, 87000, 87000, 87000, 87000, 88000, 88000, 88000, 88000, 93000, 93000, 97000, 97000, 98000, 98000, 99000, 99000, 101000, 101000, 103000, 103000, 103000, 103000, 113000, 113000, 120000, 120000, 126000, 126000, 137000, 137000, 39, 81, 6, 77, 40, 76, 6, 94, 3, 72, 14, 99, 15, 77, 13, 79, 35, 66, 29, 98, 35, 73, 5, 73, 14, 82, 32, 61, 31, 87, 4, 73, 4, 92, 14, 81, 17, 73, 26, 75, 35, 92, 36, 61, 28, 65, 55, 47, 42, 42, 52, 60, 54, 60, 45, 41, 50, 46, 51, 46, 56, 55, 52, 59, 51, 59, 50, 48, 59, 47, 55, 42, 49, 56, 47, 54, 53, 48, 52, 42, 51, 55, 41, 44, 57, 46, 58, 55, 60, 46, 55, 41, 49, 40, 42, 52, 47, 50, 42, 49, 41, 48, 59, 55, 56, 42, 50, 46, 43, 48, 52, 54, 42, 46, 48, 50, 43, 59, 43, 57, 56, 40, 58, 91, 29, 77, 35, 95, 11, 75, 9, 75, 34, 71, 5, 88, 7, 73, 10, 72, 5, 93, 40, 87, 12, 97, 36, 74, 22, 90, 17, 88, 20, 76, 16, 89, 1, 78, 1, 73, 35, 83, 5, 93, 26, 75, 20, 95, 27, 63, 13, 75, 10, 92, 13, 86, 15, 69, 14, 90, 32, 86, 15, 88, 39, 97, 24, 68, 17, 85, 23, 69, 8, 91, 16, 79, 28, 74, 18, 83)
# Reshape the raw vector into a three-column data frame: the variables
# were concatenated one after another, so fill column-wise (the matrix
# default).
mall <- setNames(
  as.data.frame(matrix(mallData, ncol = 3)),
  c("Age", "AnnualIncome", "SpendingScore")
)
# Quick structural overview of the mall data
glimpse(mall)
# Min/max of each variable -- the scales differ wildly, motivating the
# standardisation performed below
sapply(mall, range)
# Marginal distribution of each variable
hist(mall$Age, breaks = 10)
hist(mall$SpendingScore, breaks = 10)
hist(mall$AnnualIncome, breaks = 10)
# Standardise all variables (mean 0, sd 1) before distance-based clustering
mall_scaled <- scale(mall)
# Elbow curve: within-cluster over total sum of squares for K = 1..10.
# vapply iterates in the same order as the original for loop, so the
# kmeans RNG draws (nstart = 20 random starts each) are unchanged.
ratios <- vapply(seq_len(10), function(k) {
  km_fit <- kmeans(mall_scaled, k, nstart = 20)
  km_fit$tot.withinss / km_fit$totss
}, numeric(1))
# Line plot of the ratio as a function of the number of clusters
plot(ratios, type = "b", xlab = "number of clusters")
# Cluster mall_scaled data using k = 6: mall_6 (seed fixed so the random
# starts, and hence the cluster labels, are reproducible)
set.seed(123)
mall_6 <- kmeans(mall_scaled, centers=6, nstart=20)
# Average each variable per cluster, computed on the original unscaled
# data frame so the means are in interpretable units
mall %>%
mutate(cluster = mall_6$cluster) %>%
group_by(cluster) %>%
summarize_all(list(~mean(.)))
library(clValid)
# Create the list of clustering methods: methods
methods <- c("hierarchical", "kmeans", "pam")
# Compare clustering methods over K = 2..10 using internal validation
# measures: results
results <- clValid::clValid(mall_scaled, 2:10, clMethods = methods, validation = "internal")
# Summarize the results
summary(results)
# Create the list of clustering methods: methods
methods <- c("hierarchical", "kmeans", "pam")
# Compare the same methods using stability validation; note this
# overwrites `results`, so the plots below use the stability run
results <- clValid(mall_scaled, 2:10, clMethods = methods, validation = "stability")
# Summarize the results
summary(results)
# Plot 3D mall_scaled data (one axis per standardised variable)
plot3D::scatter3D(x = mall_scaled[, 1], y = mall_scaled[, 2], z = mall_scaled[, 3], col = "blue")
# Get K-means centroids for K = 7 from the clValid run and add them to
# the plot
km_centers <- results@clusterObjs$kmeans$`7`$centers
plot3D::points3D(km_centers[, 1], km_centers[, 2], km_centers[, 3], col = "red", pch=20, add=TRUE, cex=2.5)
# Get PAM's medoids for K = 7 and add them to the plot.
# NOTE(review): pam's $medoids is usually a matrix of medoid coordinates,
# not row indices -- verify that indexing mall_scaled with it is intended.
pam_idxs <- results@clusterObjs$pam$'7'$medoids
pam_med <- mall_scaled[pam_idxs, ]
plot3D::points3D(pam_med[, 1], pam_med[, 2], pam_med[, 3], col = "green", pch=20, add=TRUE, cex=2.5)
# Keep a pristine copy before overwriting apps
appsOld <- apps
apps <- appsOld %>%
select(Rating, Reviews, Installs, Type, Price, `Content Rating`) %>%
rename(Content=`Content Rating`) %>%
# HasPositiveReviews is a constant (TRUE) column, presumably added on
# purpose to demonstrate zero-variance detection below; Price arrives as
# "$..."-style text, so strip the dollar sign before converting
mutate(HasPositiveReviews=TRUE, Price=as.numeric(gsub('\\$', '', Price))) %>%
filter(complete.cases(.))
# Glimpse at the data
glimpse(apps)
# Identify near-zero-variance predictors: nzv
nzv <- caret::nearZeroVar(apps, names=TRUE)
print(nzv)
# Frequency of the HasPositiveReviews attribute (a single value)
table(apps$HasPositiveReviews)
# Frequency of the Price attribute
table(apps$Price)
# Remove the near-zero-variance features: apps_clean
apps_clean <- apps %>%
select(-HasPositiveReviews, -Price)
# Glimpse at the fifa data
# glimpse(fifa)
# Are there zero or near-zero variance features?
# nearZeroVar(fifa)
# Highly correlated predictors: cor_90plus
# (cor_90plus <- findCorrelation(cor(fifa), names = TRUE))
# Highly correlated predictors (>= 98%): cor_98plus
# (cor_98plus <- findCorrelation(cor(fifa), names = TRUE, cutoff = 0.98))
# Remove cor_90plus features: fifa_clean
# fifa_clean <- fifa %>%
# select(-cor_90plus)
# Train model on original scaled data: mdl_orig
# mdl_orig <- train(Club ~ ., data = team_train, method="svmLinear2", trControl = trainCtrl)
# Predict on original test data: orig_preds, orig_probs
# orig_preds <- predict(mdl_orig, team_test)
# orig_probs <- predict(mdl_orig, team_test, type="prob")
# Compute and print the confusion matrix: cm_orig
# (cm_orig <- confusionMatrix(orig_preds, team_test$Club))
# Compute and print AUC: auc_orig
# (auc_orig <- auc(team_test$Club == 'Real.Madrid', orig_probs$'Real.Madrid'))
# Transform training and test data: train_pca, test_pca
# pca <- preProcess(x = team_train[, -match("Club", names(team_train))], method = "pca")
# train_pca <- predict(pca, team_train)
# test_pca <- predict(pca, team_test)
# Train model on PCA data: mdl_pca
# mdl_pca <- train(Club ~ ., data = train_pca, method = "svmLinear2", trControl = trainCtrl)
# Predict on PCA data: pca_preds, pca_probs
# pca_preds <- predict(mdl_pca, test_pca)
# pca_probs <- predict(mdl_pca, test_pca, type = "prob")
# Compute and print confusion matrix & AUC: cm_pca, auc_pca
# (cm_pca <- confusionMatrix(pca_preds, test_pca$Club))
# (auc_pca <- auc(test_pca$Club == 'Real.Madrid', pca_probs$'Real.Madrid'))
# Transform training and test data: train_lda, test_lda
# my_lda <- lda(Club ~ ., data = team_train)
# train_lda <- as.data.frame(predict(my_lda, team_train))
# test_lda <- as.data.frame(predict(my_lda, team_test))
# Train model on LDA-preprocessed data: mdl_lda
# mdl_lda <- train(class ~ ., data = train_lda, method="svmLinear2", trControl = trainCtrl)
# Predict on LDA-ed test data: lda_preds, lda_probs
# lda_preds <- predict(mdl_lda, test_lda)
# lda_probs <- predict(mdl_lda, test_lda, type="prob")
# Compute and print confusion matrix & AUC: cm_lda, auc_lda
# (cm_lda <- confusionMatrix(lda_preds, test_lda$class))
# (auc_lda <- auc(test_lda$class == 'Real.Madrid', lda_probs$Real.Madrid))
Chapter 4 - Model Evaluation
Model Evaluation:
Handling Imbalanced Data:
Hyperparameter Tuning:
Random Forests or Gradient Boosted Trees:
Wrap Up:
Example code includes:
# Rebuild the apps data for binary classification (EDUCATION vs
# ENTERTAINMENT): keep six columns, drop incomplete rows, and turn the
# categorical columns into factors
apps <- appsOld %>%
select(Category, Rating, Reviews, Size, Installs, `Content Rating`) %>%
# Rename to a syntactic column name (no embedded space)
rename(Content.Rating=`Content Rating`) %>%
filter(complete.cases(.), Category %in% c("EDUCATION", "ENTERTAINMENT"),
# Non-numeric sizes cannot be parsed by the conversion step below
Size!="Varies with device"
) %>%
mutate(Category=factor(Category), Installs=factor(Installs), Content.Rating=factor(Content.Rating))
# Convert the Size strings ("33M", "450k", ...) to numeric megabytes.
# Rows matching neither pattern stay NA (numeric NA, since the vector is
# assigned numeric values below).
appSize <- rep(NA_real_, nrow(apps))
# Inside a POSIX bracket expression "." is already literal, and "\\."
# would (wrongly) also put a literal backslash into the class -- so the
# class is written [0-9.].
mbSize <- grep("^[0-9][0-9.]*M", apps$Size)
kbSize <- grep("^[0-9][0-9.]*k", apps$Size)
appSize[mbSize] <- as.numeric(gsub('M', '', apps$Size[mbSize]))
# Kilobytes -> megabytes
appSize[kbSize] <- as.numeric(gsub('k', '', apps$Size[kbSize])) / 1000
apps$Size <- appSize
# Inspect the cleaned data
glimpse(apps)
# Reproducible 75/25 train/test split
set.seed(1912261548)
# seq_len() instead of 1:nrow(): 1:nrow(x) silently yields c(1, 0) when
# the data frame is empty
trIndex <- sort(sample(seq_len(nrow(apps)), round(0.75*nrow(apps)), replace=FALSE))
training <- apps[trIndex, ]
testing <- apps[-trIndex, ]
# 10-fold CV control that computes class probabilities, so ROC-based
# summaries (twoClassSummary) can drive model tuning
cv10 <- caret::trainControl(method="cv", number=10, classProbs=TRUE,
summaryFunction=caret::twoClassSummary
)
# Tune and fit a K-nearest-neighbours classifier with 10-fold CV,
# selecting k by ROC: mdlKNN
set.seed(123)
mdlKNN <- train(Category ~ ., data = training, method = "knn", trControl = cv10, metric="ROC")
# Print the KNN model and its confusion matrix
print(mdlKNN)
# ModelMetrics::confusionMatrix(actual, predicted): the observed labels
# come first. The original call passed the predictions first, which
# transposes the matrix.
ModelMetrics::confusionMatrix(testing$Category, predict(mdlKNN, testing))
# Predict class labels and probs: knn_preds, knn_probs
knn_preds <- predict(mdlKNN, newdata = testing)
knn_probs <- predict(mdlKNN, newdata = testing, type="prob")
# Print accuracy, and AUC for the ENTERTAINMENT class probabilities
# (column selected by name rather than position)
print(Metrics::accuracy(testing$Category, knn_preds))
print(Metrics::auc(testing$Category == 'ENTERTAINMENT', knn_probs[, "ENTERTAINMENT"]))
# Train SVM: mdlSVM
# set.seed(123)
# mdlSVM <- train(Overall ~ ., data = training, method = "svmRadial", trControl = cv10)
# Print the SVM model
# print(mdlSVM)
# Predict overall score on testing data: svm_preds
# svm_preds <- predict(mdlSVM, newdata = testing)
# Print RMSE and MAE values
# print(rmse(testing$Overall, svm_preds))
# print(mae(testing$Overall, svm_preds))
# Glimpse at the standardised mall data
glimpse(mall_scaled)
# Run DIANA (divisive hierarchical clustering) over K = 2..10 with
# internal validation: results
results <- clValid::clValid(mall_scaled, 2:10, clMethods = "diana", validation = "internal")
# Print and summarize results
print(results)
summary(results)
# Plot the validation measures as a function of K
plot(results)
# Glimpse at the data
# glimpse(pulsar)
# Is there a class imbalance?
# table(pulsar$target_class)
# Set seed and partition data
# set.seed(123)
# inTrain <- createDataPartition(y = pulsar$target_class, p = .75, list = FALSE)
# training <- pulsar[inTrain,]
# testing <- pulsar[-inTrain,]
# Is there class imbalance in the training and test sets?
# table(training$target_class)
# table(testing$target_class)
# Fit a CART model (rpart) on train_data with 10-fold CV, tuned on ROC.
# samplingMode is forwarded to trainControl(): "down", "up", "smote", or
# NULL for no resampling of the training folds.
trainDTree <- function(train_data, samplingMode = NULL) {
  # Fixed seed so the CV folds are identical across sampling modes
  set.seed(123)
  cv_control <- trainControl(
    method = "cv",
    number = 10,
    classProbs = TRUE,
    summaryFunction = twoClassSummary,
    sampling = samplingMode
  )
  train(
    target_class ~ .,
    data = train_data,
    method = "rpart",
    metric = "ROC",
    trControl = cv_control
  )
}
# Train and print model with no subsampling: mdl_orig
# (mdl_orig <- trainDTree(training))
# Train model with downsampling: mdl_down
# (mdl_down <- trainDTree(training, samplingMode = "down"))
# Train model with upsampling: mdl_up
# (mdl_up <- trainDTree(training, samplingMode = "up"))
# Train model with SMOTE: mdl_smote
# (mdl_smote <- trainDTree(training, samplingMode = "smote"))
# Compute test-set AUC for a binary caret model.
#
# Args:
#   model: fitted caret `train` object with class probabilities enabled.
#   data: data frame with the predictors and a `target_class` factor whose
#     positive level is "yes".
#
# Returns the AUC of the predicted "yes" probabilities.
get_auc <- function(model, data) {
  # Namespace-qualified call instead of library(Metrics): calling
  # library() inside a function mutates the global search path as a
  # side effect.
  preds <- predict(model, data, type = "prob")[, "yes"]
  Metrics::auc(data$target_class == "yes", preds)
}
# Create model list: mdl_list
# mdl_list <- list(orig = mdl_orig, down = mdl_down, up = mdl_up, smote = mdl_smote)
# Compute AUC on training subsamples: resampling
# resampling <- resamples(mdl_list)
# summary(resampling, metric="ROC")
# Compute AUC on test data: auc_values
# auc_values <- sapply(mdl_list, FUN=get_auc, data = testing)
# print(auc_values)
# Reproducible 75/25 split of the car fuel-consumption data
set.seed(1912261602)
# seq_len() instead of 1:nrow(): 1:nrow(x) silently yields c(1, 0) when
# the data frame is empty
carIdx <- sort(sample(seq_len(nrow(car)), round(0.75*nrow(car)), replace=FALSE))
car_train <- car[carIdx, ]
car_test <- car[-carIdx, ]
# Tuning strategy 1 -- caret defaults with repeated CV (3 folds x 5
# repeats): trc
trc <- caret::trainControl(method = "repeatedcv", number = 3, repeats = 5)
# Train model: svmr
svmr <- caret::train(consume ~ ., data = car_train, method = "svmRadial", trControl = trc)
# Print and plot SVM model
print(svmr)
plot(svmr)
# Tuning strategy 2 -- explicit hyperparameter grid with plain 10-fold CV
trc <- caret::trainControl(method = "cv", number = 10)
# Create custom hyperparameter grid (cost x kernel width): hp_grid
hp_grid <- expand.grid(C = seq(from=0.2, to=1.0, by=0.2), sigma = c(0.35, 0.6, 0.75))
# Train model: svmr
svmr <- caret::train(consume ~ ., data = car_train, method = "svmRadial", trControl = trc, tuneGrid = hp_grid)
# Print and plot SVM model
print(svmr)
plot(svmr)
# Tuning strategy 3 -- random search over 10 candidate settings
set.seed(42)
# Set up train control: trc
trc <- caret::trainControl(method = "cv", number = 10, search = "random")
# Train model: svmr
svmr <- caret::train(consume ~ ., data = car_train, method = "svmRadial", trControl = trc, tuneLength = 10)
# Print and plot SVM model
print(svmr)
plot(svmr)
# Train the RF model with 500 trees: mdlRF
mdlRF <- randomForest::randomForest(formula = Rating ~ ., data = training, ntree = 500)
# Print the RF model
print(mdlRF)
# RF variable importance (plot and raw numbers)
randomForest::varImpPlot(mdlRF)
print(mdlRF$importance)
# Train a GBM model with 500 trees: mdlGBM
# NOTE(review): no `distribution` argument is given, so gbm guesses one
# from the response type -- confirm it picks gaussian for numeric Rating
mdlGBM <- gbm::gbm(formula = Rating ~ ., data = training, n.trees = 500)
# Print GBM model
print(mdlGBM)
# Summarize GBM's variable importance
summary(mdlGBM)
# Predict on the testing data (GBM needs n.trees repeated at predict
# time): gbm_preds, rf_preds
gbm_preds <- predict(mdlGBM, n.trees = 500, newdata = testing)
rf_preds <- predict(mdlRF, newdata = testing)
# RMSE metric for both models (outer parentheses auto-print): gbm_rmse,
# rf_rmse
(gbm_rmse <- Metrics::rmse(testing$Rating, gbm_preds))
(rf_rmse <- Metrics::rmse(testing$Rating, rf_preds))
# RRSE (root relative squared error) for both models: gbm_rrse, rf_rrse
(gbm_rrse <- Metrics::rrse(testing$Rating, gbm_preds))
(rf_rrse <- Metrics::rrse(testing$Rating, rf_preds))
Chapter 1 - True Fundamentals
Regular Expression Basics:
Tokenization:
Text Cleaning Basics:
Example code includes:
# Ten practice sentences about the boss, John, for the regex exercises
text <- c(
  "John's favorite color two colors are blue and red.",
  "John's favorite number is 1111.",
  "John lives at P Sherman, 42 Wallaby Way, Sydney",
  "He is 7 feet tall",
  "John has visited 30 countries",
  "John only has nine fingers.",
  "John has worked at eleven different jobs",
  "He can speak 3 languages",
  "john's favorite food is pizza",
  "John can name 10 facts about himself."
)
# Entries containing at least one digit
text[grepl("\\d", text)]
# Indices of entries where a digit is followed by whitespace
which(grepl("\\d\\s", text))
# How many entries mention 'favorite'?
sum(grepl("favorite", text))
# Entries that use the boss's name (case-sensitive, so "john's" is missed)
text[grepl("John", text)]
# Replace every occurrence of "John" with "He " -- note the stray space
# this leaves inside words such as "John's"
gsub("John", "He ", text)
# Better: replace "John" only when it is followed by whitespace
clean_text <- gsub("John\\s", "He ", text)
clean_text
# And replace the possessive "John's" with "His" (literal match)
gsub("John\\'s", "His", clean_text)
# Load the Animal Farm text, one row per chapter (chapter + text_column)
animal_farm <- read_csv("./RInputFiles/animal_farm.csv")
str(animal_farm)
# Split the text_column into sentences using tidytext's built-in
# sentence tokenizer
animal_farm %>%
tidytext::unnest_tokens(output = "sentences", input = text_column, token = "sentences") %>%
# Count sentences, per chapter
count(chapter)
# Split the text_column using a regular expression instead: split on
# periods (counts will differ from the sentence tokenizer above)
animal_farm %>%
tidytext::unnest_tokens(output = "sentences", input = text_column, token = "regex", pattern = "\\.") %>%
count(chapter)
# Tokenize animal farm's text_column column into one row per word
tidy_animal_farm <- animal_farm %>%
tidytext::unnest_tokens(word, text_column)
# Print the word frequencies (stop words dominate at this point)
tidy_animal_farm %>%
count(word, sort = TRUE)
# Remove stop words, using stop_words from tidytext; anti_join keeps
# only words absent from the stop-word lexicon
str(tidy_animal_farm)
tidy_animal_farm <- tidy_animal_farm %>%
anti_join(tidytext::stop_words)
str(tidy_animal_farm)
# Perform stemming on tidy_animal_farm, collapsing inflected forms
# (e.g. plurals) onto a common stem
stemmed_animal_farm <- tidy_animal_farm %>%
mutate(word = SnowballC::wordStem(word))
# Print the old word frequencies
tidy_animal_farm %>%
count(word, sort = TRUE)
# Print the new word frequencies for comparison with the unstemmed counts
stemmed_animal_farm %>%
count(word, sort = TRUE)
Chapter 2 - Representations of Text
Understanding an R Corpus:
Bag-of-words Representation:
TFIDF - Term Frequency Inverse Document Frequency:
Cosine Similarity:
Example code includes:
crudeText <- c('Diamond Shamrock Corp said that\neffective today it had cut its contract prices for crude oil by\n1.50 dlrs a barrel.\n The reduction brings its posted price for West Texas\nIntermediate to 16.00 dlrs a barrel, the copany said.\n \"The price reduction today was made in the light of falling\noil product prices and a weak crude oil market,\" a company\nspokeswoman said.\n Diamond is the latest in a line of U.S. oil companies that\nhave cut its contract, or posted, prices over the last two days\nciting weak oil markets.\n Reuter')
crudeText <- c(crudeText, 'OPEC may be forced to meet before a\nscheduled June session to readdress its production cutting\nagreement if the organization wants to halt the current slide\nin oil prices, oil industry analysts said.\n \"The movement to higher oil prices was never to be as easy\nas OPEC thought. They may need an emergency meeting to sort out\nthe problems,\" said Daniel Yergin, director of Cambridge Energy\nResearch Associates, CERA.\n Analysts and oil industry sources said the problem OPEC\nfaces is excess oil supply in world oil markets.\n \"OPECs problem is not a price problem but a production\nissue and must be addressed in that way,\" said Paul Mlotok, oil\nanalyst with Salomon Brothers Inc.\n He said the markets earlier optimism about OPEC and its\nability to keep production under control have given way to a\npessimistic outlook that the organization must address soon if\nit wishes to regain the initiative in oil prices.\n But some other analysts were uncertain that even an\nemergency meeting would address the problem of OPEC production\nabove the 15.8 mln bpd quota set last December.\n \"OPEC has to learn that in a buyers market you cannot have\ndeemed quotas, fixed prices and set differentials,\" said the\nregional manager for one of the major oil companies who spoke\non condition that he not be named. \"The market is now trying to\nteach them that lesson again,\" he added.\n David T. Mizrahi, editor of Mideast reports, expects OPEC\nto meet before June, although not immediately. 
However, he is\nnot optimistic that OPEC can address its principal problems.\n \"They will not meet now as they try to take advantage of the\nwinter demand to sell their oil, but in late March and April\nwhen demand slackens,\" Mizrahi said.\n But Mizrahi said that OPEC is unlikely to do anything more\nthan reiterate its agreement to keep output at 15.8 mln bpd.\"\n Analysts said that the next two months will be critical for\nOPECs ability to hold together prices and output.\n \"OPEC must hold to its pact for the next six to eight weeks\nsince buyers will come back into the market then,\" said Dillard\nSpriggs of Petroleum Analysis Ltd in New York.\n But Bijan Moussavar-Rahmani of Harvard Universitys Energy\nand Environment Policy Center said that the demand for OPEC oil\nhas been rising through the first quarter and this may have\nprompted excesses in its production.\n \"Demand for their (OPEC) oil is clearly above 15.8 mln bpd\nand is probably closer to 17 mln bpd or higher now so what we\nare seeing characterized as cheating is OPEC meeting this\ndemand through current production,\" he told Reuters in a\ntelephone interview.\n Reuter')
crudeText <- c(crudeText, 'Texaco Canada said it lowered the\ncontract price it will pay for crude oil 64 Canadian cts a\nbarrel, effective today.\n The decrease brings the companys posted price for the\nbenchmark grade, Edmonton/Swann Hills Light Sweet, to 22.26\nCanadian dlrs a bbl.\n Texaco Canada last changed its crude oil postings on Feb\n19.\n Reuter')
crudeText <- c(crudeText, 'Marathon Petroleum Co said it reduced\nthe contract price it will pay for all grades of crude oil one\ndlr a barrel, effective today.\n The decrease brings Marathons posted price for both West\nTexas Intermediate and West Texas Sour to 16.50 dlrs a bbl. The\nSouth Louisiana Sweet grade of crude was reduced to 16.85 dlrs\na bbl.\n The company last changed its crude postings on Jan 12.\n Reuter')
crudeText <- c(crudeText, 'Houston Oil Trust said that independent\npetroleum engineers completed an annual study that estimates\nthe trusts future net revenues from total proved reserves at\n88 mln dlrs and its discounted present value of the reserves at\n64 mln dlrs.\n Based on the estimate, the trust said there may be no money\navailable for cash distributions to unitholders for the\nremainder of the year.\n It said the estimates reflect a decrease of about 44 pct in\nnet reserve revenues and 39 pct in discounted present value\ncompared with the study made in 1985.\n Reuter')
crudeText <- c(crudeText, 'Kuwait\"s Oil Minister, in remarks\npublished today, said there were no plans for an emergency OPEC\nmeeting to review oil policies after recent weakness in world\noil prices.\n Sheikh Ali al-Khalifa al-Sabah was quoted by the local\ndaily al-Qabas as saying: \"None of the OPEC members has asked\nfor such a meeting.\"\n He denied Kuwait was pumping above its quota of 948,000\nbarrels of crude daily (bpd) set under self-imposed production\nlimits of the 13-nation organisation.\n Traders and analysts in international oil markets estimate\nOPEC is producing up to one mln bpd above a ceiling of 15.8 mln\nbpd agreed in Geneva last December.\n They named Kuwait and the United Arab Emirates, along with\nthe much smaller producer Ecuador, among those producing above\nquota. Kuwait, they said, was pumping 1.2 mln bpd.\n \"This rumour is baseless. It is based on reports which said\nKuwait has the ability to exceed its share. They suppose that\nbecause Kuwait has the ability, it will do so,\" the minister\nsaid.\n Sheikh Ali has said before that Kuwait had the ability to\nproduce up to 4.0 mln bpd.\n \"If we can sell more than our quota at official prices,\nwhile some countries are suffering difficulties marketing their\nshare, it means we in Kuwait are unusually clever,\" he said.\n He was referring apparently to the Gulf state of qatar,\nwhich industry sources said was selling less than 180,000 bpd\nof its 285,000 bpd quota, because buyers were resisting\nofficial prices restored by OPEC last month pegged to a marker\nof 18 dlrs per barrel.\n Prices in New York last week dropped to their lowest levels\nthis year and almost three dollars below a three-month high of\n19 dollars a barrel.\n Sheikh Ali also delivered \"a challenge to any international\noil company that declared Kuwait sold below official prices.\"\n Because it was charging its official price, of 16.67 dlrs a\nbarrel, it had lost custom, he said but did not elaborate.\n However, Kuwait 
had guaranteed markets for its oil because\nof its local and international refining facilities and its own\ndistribution network abroad, he added.\n He reaffirmed that the planned meeting March 7 of OPEC\"s\ndifferentials committee has been postponed until the start of\nApril at the request of certain of the body\"s members.\n Ecuador\"s deputy energy minister Fernando Santos Alvite said\nlast Wednesday his debt-burdened country wanted OPEC to assign\na lower official price for its crude, and was to seek this at\ntalks this month of opec\"s pricing committee.\n Referring to pressure by oil companies on OPEC members, in\napparent reference to difficulties faced by Qatar, he said: \"We\nexpected such pressure. It will continue through March and\nApril.\" But he expected the situation would later improve.\n REUTER')
crudeText <- c(crudeText, 'Indonesia appears to be nearing a\npolitical crossroads over measures to deregulate its protected\neconomy, the U.S. Embassy says in a new report.\n To counter falling oil revenues, the government has\nlaunched a series of measures over the past nine months to\nboost exports outside the oil sector and attract new\ninvestment.\n Indonesia, the only Asian member of OPEC and a leading\nprimary commodity producer, has been severely hit by last year\"s\nfall in world oil prices, which forced it to devalue its\ncurrency by 31 pct in September.\n But the U.S. Embassy report says President Suharto\"s\ngovernment appears to be divided over what direction to lead\nthe economy.\n \"(It) appears to be nearing a crossroads with regard to\nderegulation, both as it pertains to investments and imports,\"\nthe report says. It primarily assesses Indonesia\"s agricultural\nsector, but also reviews the country\"s general economic\nperformance.\n It says that while many government officials and advisers\nare recommending further relaxation, \"there are equally strong\npressures being exerted to halt all such moves.\"\n \"This group strongly favours an import substitution economy,\"\nthe report says.\n Indonesia\"s economic changes have been welcomed by the World\nBank and international bankers as steps in the right direction,\nthough they say crucial areas of the economy like plastics and\nsteel remain highly protected, and virtual monopolies.\n Three sets of measures have been announced since last May,\nwhich broadened areas for foreign investment, reduced trade\nrestrictions and liberalised imports.\n The report says Indonesia\"s economic growth in calendar 1986\nwas probably about zero, and the economy may even have\ncontracted a bit. 
\"This is the lowest rate of growth since the\nmid-1960s,\" the report notes.\n Indonesia, the largest country in South-East Asia with a\npopulation of 168 million, is facing general elections in\nApril.\n But the report hold out little hope for swift improvement\nin the economic outlook. \"For 1987 early indications point to a\nslightly positive growth rate not exceeding one pct. Economic\nactivity continues to suffer due to the sharp fall in export\nearnings from the petroleum industry.\"\n \"Growth in the non-oil sector is low because of weak\ndomestic demand coupled with excessive plant capacity, real\ndeclines in construction and trade, and a reduced level of\ngrowth in agriculture,\" the report states.\n Bankers say continuation of present economic reforms is\ncrucial for the government to get the international lending its\nneeds.\n A new World Bank loan of 300 mln dlrs last month in balance\nof payments support was given partly to help the government\nmaintain the momentum of reform, the Bank said.\n REUTER')
crudeText <- c(crudeText, 'Saudi riyal interbank deposits were\nsteady at yesterdays higher levels in a quiet market.\n Traders said they were reluctant to take out new positions\namidst uncertainty over whether OPEC will succeed in halting\nthe current decline in oil prices.\n Oil industry sources said yesterday several Gulf Arab\nproducers had had difficulty selling oil at official OPEC\nprices but Kuwait has said there are no plans for an emergency\nmeeting of the 13-member organisation.\n A traditional Sunday lull in trading due to the European\nweekend also contributed to the lack of market activity.\n Spot-next and one-week rates were put at 6-1/4, 5-3/4 pct\nafter quotes ranging between seven, six yesterday.\n One, three, and six-month deposits were quoted unchanged at\n6-5/8, 3/8, 7-1/8, 6-7/8 and 7-3/8, 1/8 pct respectively.\n The spot riyal was quietly firmer at 3.7495/98 to the\ndollar after quotes of 3.7500/03 yesterday.\n REUTER')
crudeText <- c(crudeText, 'The Gulf oil state of Qatar, recovering\nslightly from last years decline in world oil prices,\nannounced its first budget since early 1985 and projected a\ndeficit of 5.472 billion riyals.\n The deficit compared with a shortfall of 7.3 billion riyals\nin the last published budget for 1985/86.\n In a statement outlining the budget for the fiscal year\n1987/88 beginning today, Finance and Petroleum Minister Sheikh\nAbdul-Aziz bin Khalifa al-Thani said the government expected to\nspend 12.217 billion riyals in the period.\n Projected expenditure in the 1985/86 budget had been 15.6\nbillion riyals.\n Sheikh Abdul-Aziz said government revenue would be about\n6.745 billion riyals, down by about 30 pct on the 1985/86\nprojected revenue of 9.7 billion.\n The government failed to publish a 1986/87 budget due to\nuncertainty surrounding oil revenues.\n Sheikh Abdul-Aziz said that during that year the government\ndecided to limit recurrent expenditure each month to\none-twelfth of the previous fiscal years allocations minus 15\npct.\n He urged heads of government departments and public\ninstitutions to help the government rationalise expenditure. He\ndid not say how the 1987/88 budget shortfall would be covered.\n Sheikh Abdul-Aziz said plans to limit expenditure in\n1986/87 had been taken in order to relieve the burden placed on\nthe countrys foreign reserves.\n He added in 1987/88 some 2.766 billion riyals had been\nallocated for major projects including housing and public\nbuildings, social services, health, education, transport and\ncommunications, electricity and water, industry and\nagriculture.\n No figure was revealed for expenditure on defence and\nsecurity. 
There was also no projection for oil revenue.\n Qatar, an OPEC member, has an output ceiling of 285,000\nbarrels per day.\n Sheikh Abdul-Aziz said: \"Our expectations of positive signs\nregarding (oil) price trends, foremost among them OPECs\ndetermination to shoulder its responsibilites and protect its\nwealth, have helped us make reasonable estimates for the coming\nyears revenue on the basis of our assigned quota.\"\n REUTER')
crudeText <- c(crudeText, 'Saudi Arabian Oil Minister Hisham Nazer\nreiterated the kingdoms commitment to last Decembers OPEC\naccord to boost world oil prices and stabilise the market, the\nofficial Saudi Press Agency SPA said.\n Asked by the agency about the recent fall in free market\noil prices, Nazer said Saudi Arabia \"is fully adhering by the\n... Accord and it will never sell its oil at prices below the\npronounced prices under any circumstance.\"\n Nazer, quoted by SPA, said recent pressure on free market\nprices \"may be because of the end of the (northern hemisphere)\nwinter season and the glut in the market.\"\n Saudi Arabia was a main architect of the December accord,\nunder which OPEC agreed to lower its total output ceiling by\n7.25 pct to 15.8 mln barrels per day (bpd) and return to fixed\nprices of around 18 dlrs a barrel.\n The agreement followed a year of turmoil on oil markets,\nwhich saw prices slump briefly to under 10 dlrs a barrel in\nmid-1986 from about 30 dlrs in late 1985. 
Free market prices\nare currently just over 16 dlrs.\n Nazer was quoted by the SPA as saying Saudi Arabias\nadherence to the accord was shown clearly in the oil market.\n He said contacts among members of OPEC showed they all\nwanted to stick to the accord.\n In Jamaica, OPEC President Rilwanu Lukman, who is also\nNigerian Oil Minister, said the group planned to stick with the\npricing agreement.\n \"We are aware of the negative forces trying to manipulate\nthe operations of the market, but we are satisfied that the\nfundamentals exist for stable market conditions,\" he said.\n Kuwaits Oil Minister, Sheikh Ali al-Khalifa al-Sabah, said\nin remarks published in the emirates daily Al-Qabas there were\nno plans for an emergency OPEC meeting to review prices.\n Traders and analysts in international oil markets estimate\nOPEC is producing up to one mln bpd above the 15.8 mln ceiling.\n They named Kuwait and the United Arab Emirates, along with\nthe much smaller producer Ecuador, among those producing above\nquota. Sheikh Ali denied that Kuwait was over-producing.\n REUTER')
crudeText <- c(crudeText, 'Saudi crude oil output last month fell\nto an average of 3.5 mln barrels per day (bpd) from 3.8 mln bpd\nin January, Gulf oil sources said.\n They said exports from the Ras Tanurah and Juaymah\nterminals in the Gulf fell to an average 1.9 mln bpd last month\nfrom 2.2 mln in January because of lower liftings by some\ncustomers.\n But the drop was much smaller than expected after Gulf\nexports rallied in the fourth week of February to 2.5 mln bpd\nfrom 1.2 mln in the third week, the sources said.\n The production figures include neutral zone output but not\nsales from floating storage, which are generally considered\npart of a countrys output for Opec purposes.\n Saudi Arabia has an Opec quota of 4.133 mln bpd under a\nproduction restraint scheme approved by the 13-nation group\nlast December to back new official oil prices averaging 18 dlrs\na barrel.\n The sources said the two-fold jump in exports last week\nappeared to be the result of buyers rushing to lift February\nentitlements before the month-end.\n Last weeks high export levels appeared to show continued\nsupport for official Opec prices from Saudi Arabias main crude\ncustomers, the four ex-partners of Aramco, the sources said.\n The four -- Exxon Corp <XON>, Mobil Corp <MOB>, Texaco Inc\n<TX> and Chevron Corp <CHV> -- signed a long-term agreement\nlast month to buy Saudi crude for 17.52 dlrs a barrel.\n However the sources said the real test of Saudi Arabias\nability to sell crude at official prices in a weak market will\ncome this month, when demand for petroleum products\ntraditionally tapers off. 
Spot prices have fallen in recent\nweeks to more than one dlr below Opec levels.\n Saudi Arabian oil minister Hisham Nazer yesterday\nreiterated the kingdoms commitment to the December OPEC accord\nand said it would never sell below official prices.\n The sources said total Saudi refinery throughput fell\nslightly in February to an average 1.1 mln bpd from 1.2 mln in\nJanuary because of cuts at the Yanbu and Jubail export\nrefineries.\n They put crude oil exports through Yanbu at 100,000 bpd\nlast month, compared to zero in January, while throughput at\nBahrains refinery and neutral zone production remained steady\nat around 200,000 bpd each.\n REUTER')
crudeText <- c(crudeText, 'Deputy oil ministers from six Gulf\nArab states will meet in Bahrain today to discuss coordination\nof crude oil marketing, the official Emirates news agency WAM\nreported.\n WAM said the officials would be discussing implementation\nof last Sundays agreement in Doha by Gulf Cooperation Council\n(GCC) oil ministers to help each other market their crude oil.\n Four of the GCC states - Saudi Arabia, the United Arab\nEmirates (UAE), Kuwait and Qatar - are members of the\nOrganiaation of Petroleum Exporting Countries (OPEC) and some\nface stiff buyer resistance to official OPEC prices.\n Reuter')
crudeText <- c(crudeText, 'Saudi Arabian Oil Minister Hisham Nazer\nreiterated the kingdoms commitment to last Decembers OPEC\naccord to boost world oil prices and stabilize the market, the\nofficial Saudi Press Agency SPA said.\n Asked by the agency about the recent fall in free market\noil prices, Nazer said Saudi Arabia \"is fully adhering by the\n... accord and it will never sell its oil at prices below the\npronounced prices under any circumstance.\"\n Saudi Arabia was a main architect of December pact under\nwhich OPEC agreed to cut its total oil output ceiling by 7.25\npct and return to fixed prices of around 18 dollars a barrel.\n Reuter')
crudeText <- c(crudeText, 'Kuwaits oil minister said in a newspaper\ninterview that there were no plans for an emergency OPEC\nmeeting after the recent weakness in world oil prices.\n Sheikh Ali al-Khalifa al-Sabah was quoted by the local\ndaily al-Qabas as saying that \"none of the OPEC members has\nasked for such a meeting.\"\n He also denied that Kuwait was pumping above its OPEC quota\nof 948,000 barrels of crude daily (bpd).\n Crude oil prices fell sharply last week as international\noil traders and analysts estimated the 13-nation OPEC was\npumping up to one million bpd over its self-imposed limits.\n Reuter')
crudeText <- c(crudeText, 'The port of Philadelphia was closed\nwhen a Cypriot oil tanker, Seapride II, ran aground after\nhitting a 200-foot tower supporting power lines across the\nriver, a Coast Guard spokesman said.\n He said there was no oil spill but the ship is lodged on\nrocks opposite the Hope Creek nuclear power plant in New\nJersey.\n He said the port would be closed until today when they\nhoped to refloat the ship on the high tide.\n After delivering oil to a refinery in Paulsboro, New\nJersey, the ship apparently lost its steering and hit the power\ntransmission line carrying power from the nuclear plant to the\nstate of Delaware.\n Reuter')
crudeText <- c(crudeText, 'A study group said the United States\nshould increase its strategic petroleum reserve to one mln\nbarrels as one way to deal with the present and future impact\nof low oil prices on the domestic oil industry.\n U.S. policy now is to raise the strategic reserve to 750\nmln barrels, from its present 500 mln, to help protect the\neconomy from an overseas embargo or a sharp price rise.\n The Aspen Institute for Humanistic Studies, a private\ngroup, also called for new research for oil exploration and\ndevelopment techniques.\n It predicted prices would remain at about 15-18 dlrs a\nbarrel for several years and then rise to the mid 20s, with\nimports at about 30 pct of U.S. consumption.\n It said instead that such moves as increasing oil reserves\nand more exploration and development research would help to\nguard against or mitigate the risks of increased imports.\n Reuter')
crudeText <- c(crudeText, 'A study group said the United States\nshould increase its strategic petroleum reserve to one mln\nbarrels as one way to deal with the present and future impact\nof low oil prices on the domestic oil industry.\n U.S. policy now is to raise the strategic reserve to 750\nmln barrels, from its present 500 mln, to help protect the\neconomy from an overseas embargo or a sharp price rise.\n The Aspen Institute for Humanistic Studies, a private\ngroup, also called for new research for oil exploration and\ndevelopment techniques.\n It predicted prices would remain at about 15-18 dlrs a\nbarrel for several years and then rise to the mid 20s, with\nimports at about 30 pct of U.S. consumption.\n The study cited two basic policy paths for the nation: to\nprotect the U.S. industry through an import fee or other such\ndevice or to accept the full economic benefits of cheap oil.\n But the group did not strongly back either option, saying\nthere were benefits and drawbacks to both.\n It said instead that such moves as increasing oil reserves\nand more exploration and development research would help to\nguard against or mitigate the risks of increased imports.\n Reuter')
crudeText <- c(crudeText, 'Unocal Corps Union Oil Co said it\nlowered its posted prices for crude oil one to 1.50 dlrs a\nbarrel in the eastern region of the U.S., effective Feb 26.\n Union said a 1.50 dlrs cut brings its posted price for the\nU.S. benchmark grade, West Texas Intermediate, to 16 dlrs.\nLouisiana Sweet also was lowered 1.50 dlrs to 16.35 dlrs, the\ncompany said.\n No changes were made in Unions posted prices for West\nCoast grades of crude oil, the company said.\n Reuter')
crudeText <- c(crudeText, 'The New York Mercantile Exchange set\nApril one for the debut of a new procedure in the energy\ncomplex that will increase the use of energy futures worldwide.\n On April one, NYMEX will allow oil traders that do not\nhold a futures position to initiate, after the exchange closes,\na transaction that can subsequently be hedged in the futures\nmarket, according to an exchange spokeswoman.\n \"This will change the way oil is transacted in the real\nworld,\" said said Thomas McKiernan, McKiernan and Co chairman.\n Foreign traders will be able to hedge trades against NYMEX\nprices before the exchange opens and negotiate prices at a\ndifferential to NYMEX prices, McKiernan explained.\n The expanded program \"will serve the industry because the\noil market does not close when NYMEX does,\" said Frank Capozza,\nsecretary of Century Resources Inc.\n The rule change, which has already taken effect for\nplatinum futures on NYMEX, is expected to increase the open\ninterest and liquidity in U.S. energy futures, according to\ntraders and analysts.\n Currently, at least one trader in this transaction, called\nan exchange for physical or EFP, must hold a futures position\nbefore entering into the transaction.\n Under the new arrangement, neither party has to hold a\nfutures position before entering into an EFP and one or both\nparties can offset their cash transaction with a futures\ncontract the next day, according to exchange officials.\n When NYMEX announced its proposed rule change in December,\nNYMEX President Rosemary McFadden, said, \"Expansion of the EFP\nprovision will add to globalization of the energy markets by\nproviding for, in effect, 24-hour trading.\"\n The Commodity Futures Trading Commission approved the rule\nchange in February, according to a CFTC spokeswoman.\n Reuter')
crudeText <- c(crudeText, 'Argentine crude oil production was\ndown 10.8 pct in January 1987 to 12.32 mln barrels, from 13.81\nmln barrels in January 1986, Yacimientos Petroliferos Fiscales\nsaid.\n January 1987 natural gas output totalled 1.15 billion cubic\nmetrers, 3.6 pct higher than 1.11 billion cubic metres produced\nin January 1986, Yacimientos Petroliferos Fiscales added.\n Reuter')
# Build a tm corpus from the raw Reuters article texts and attach document IDs.
crude <- tm::VCorpus(tm::VectorSource(crudeText))
NLP::meta(crude, "id") <- c('127', '144', '191', '194', '211', '236', '237', '242', '246', '248', '273', '349', '352', '353', '368', '489', '502', '543', '704', '708')
# Print out the corpus
print(crude)
# Print the content of the 10th article
crude[[10]]$content
# Find the first ID
crude[[1]]$meta$id
# Make a vector of IDs.
# vapply() over the documents replaces the original for-loop that grew the
# vector with append() (O(n^2) copying) and hard-coded c(1:20); seq_along()
# also keeps this correct if the corpus size ever changes.
ids <- vapply(seq_along(crude), function(i) crude[[i]]$meta$id, character(1))
# Create a tibble & Review
crude_tibble <- generics::tidy(crude)
names(crude_tibble)
# Word frequencies for the crude corpus: tokenize the article text, count
# each token, then drop common English stop words.
crude_counts <- crude_tibble %>%
  tidytext::unnest_tokens(output = word, input = text) %>%  # one row per token
  count(word, sort = TRUE) %>%                              # frequency, descending
  anti_join(tidytext::stop_words)                           # remove stop words
# The most frequent remaining token in this corpus
top_word <- "oil"
# Load the Russian troll tweet sample and inspect its structure.
russian_tweets <- read_csv("./RInputFiles/russian_1.csv")
str(russian_tweets)
# Create a corpus (one document per tweet's `content` field)
tweet_corpus <- tm::VCorpus(tm::VectorSource(russian_tweets$content))
# Attach following and followers counts as per-document metadata
# (assumes each column has one value per document -- TODO confirm lengths)
NLP::meta(tweet_corpus, 'following') <- russian_tweets$following
NLP::meta(tweet_corpus, 'followers') <- russian_tweets$followers
# Review the meta data
head(NLP::meta(tweet_corpus))
# Count occurrence by question and word
# Count occurrences by article (id) and word, stop words removed.
words <- crude_tibble %>%
  tidytext::unnest_tokens(output = "word", token = "words", input = text) %>%
  anti_join(tidytext::stop_words) %>%
  count(id, word, sort = TRUE)
# How many different word/article combinations are there?
unique_combinations <- nrow(words)
# Filter to articles containing the word "prices"
words %>%
  filter(word == "prices")
# How many articles had the word "prices"?
# Computed from the data instead of the original hard-coded 15, so the
# value stays correct if the corpus ever changes.
number_of_price_articles <- words %>%
  filter(word == "prices") %>%
  nrow()
# Tokenize and remove stop words
# Tokenize each tweet into words and strip common stop words.
tidy_tweets <- russian_tweets %>%
  tidytext::unnest_tokens(output = word, input = content) %>%
  anti_join(tidytext::stop_words)
# Overall vocabulary: one row per distinct word
unique_words <- tidy_tweets %>%
  count(word)
# Term frequencies per tweet: one row per (tweet_id, word) pair
unique_words_by_tweet <- tidy_tweets %>%
  count(tweet_id, word)
# Cell count of a hypothetical dense document-term matrix (tweets x vocab) ...
size <- nrow(russian_tweets) * length(unique(tidy_tweets$word))
# ... versus the fraction of cells that would actually be non-zero
percent <- nrow(unique_words_by_tweet) / size
percent
# Create a tibble with TFIDF values
# Weight each (article, word) count with TF-IDF.
crude_weights <- crude_tibble %>%
  tidytext::unnest_tokens(output = "word", token = "words", input = text) %>%
  anti_join(tidytext::stop_words) %>%
  count(id, word) %>%
  tidytext::bind_tf_idf(word, id, n)   # bind_tf_idf(term, document, n)
# Words most distinctive of a single article (highest TF-IDF)
crude_weights %>%
  arrange(desc(tf_idf))
# Least distinctive words that still score above zero
crude_weights %>%
  filter(tf_idf != 0) %>%
  arrange(tf_idf)
# Create word counts
# Per-chapter word counts for Animal Farm (stop words intentionally kept,
# to show how much they inflate chapter-to-chapter similarity).
animal_farm_counts <- animal_farm %>%
  tidytext::unnest_tokens(output = word, input = text_column) %>%
  count(chapter, word)
# Pairwise cosine similarity between chapters, based on raw counts
comparisons <- animal_farm_counts %>%
  widyr::pairwise_similarity(chapter, word, n) %>%
  arrange(desc(similarity))
# Average similarity -- very high, because shared stop words dominate
comparisons %>%
  summarize(mean = mean(similarity))
# Create word counts
# Per-chapter word counts with TF-IDF weights (stop words removed).
# BUG FIX: bind_tf_idf() takes (term, document, n); the original call passed
# (chapter, word, n), i.e. document and term swapped -- compare the correct
# bind_tf_idf(word, id, n) used for the crude corpus earlier in this file.
animal_farm_counts <- animal_farm %>%
  tidytext::unnest_tokens(word, text_column) %>%
  anti_join(tidytext::stop_words) %>%
  count(chapter, word) %>%
  tidytext::bind_tf_idf(word, chapter, n)
# Cosine similarity between chapters using raw word counts
animal_farm_counts %>%
  widyr::pairwise_similarity(chapter, word, n) %>%
  arrange(desc(similarity))
# Cosine similarity using TF-IDF weights instead of counts
animal_farm_counts %>%
  widyr::pairwise_similarity(chapter, word, tf_idf) %>%
  arrange(desc(similarity))
Chapter 3 - Applications: Classification and Topic Modeling
Preparing Text for Modeling:
Classification Modeling:
Introduction to Topic Modeling:
LDA in Practice:
Example code includes:
# Stem the tokens
russian_tokens <- russian_tweets %>%
tidytext::unnest_tokens(output = "word", token = "words", input = content) %>%
anti_join(tidytext::stop_words) %>%
mutate(word = SnowballC::wordStem(word))
# Create a document term matrix
tweet_matrix <- russian_tokens %>%
count(tweet_id, word) %>%
tidytext::cast_dtm(document = tweet_id, term = word, value = n, weighting = tm::weightTfIdf)
# Print the matrix details
tweet_matrix
# Experiment with removeSparseTerms() thresholds: a term is dropped when it
# is absent from more than `sparse` proportion of documents, so larger
# values keep more terms. Each pair of bare names is auto-printed at top
# level, contrasting the full matrix with the reduced one.
less_sparse_matrix <- tm::removeSparseTerms(tweet_matrix, sparse = 0.5)
# Print results
tweet_matrix
less_sparse_matrix
# Looser threshold: keep terms present in at least 10% of documents
less_sparse_matrix <- tm::removeSparseTerms(tweet_matrix, sparse = 0.9)
# Print results
tweet_matrix
less_sparse_matrix
# Looser still: at least 1% of documents
less_sparse_matrix <- tm::removeSparseTerms(tweet_matrix, sparse = 0.99)
# Print results
tweet_matrix
less_sparse_matrix
# Nearly everything retained: at least 0.01% of documents
less_sparse_matrix <- tm::removeSparseTerms(tweet_matrix, sparse =0.9999)
# Print results
tweet_matrix
less_sparse_matrix
# Draw a balanced 2000/2000 sample of Right and Left tweets, then shuffle.
# NOTE: the seed and the exact order of the sample_n()/sample() calls
# determine which rows are drawn -- do not reorder these statements.
set.seed(2001021530)
rightTweet <- russian_tweets %>%
filter(account_type=="Right") %>%
sample_n(2000)
leftTweet <- russian_tweets %>%
filter(account_type=="Left") %>%
sample_n(2000)
# Random permutation of the 4000 combined row indices
idx <- sample(1:4000, 4000, replace=FALSE)
# Stack the two samples and shuffle the rows with that permutation
leftRightData <- rbind(rightTweet, leftTweet)[idx, ]
# Tokenize the balanced Left/Right sample, drop stop words, and stem.
leftRight_tokens <- leftRightData %>%
  tidytext::unnest_tokens(output = "word", token = "words", input = content) %>%
  anti_join(tidytext::stop_words) %>%
  mutate(word = SnowballC::wordStem(word))
# TF-IDF document-term matrix, trimmed of terms absent from 99%+ of tweets
left_right_matrix_small <- leftRight_tokens %>%
  count(tweet_id, word) %>%
  tidytext::cast_dtm(
    document = tweet_id, term = word,
    value = n, weighting = tm::weightTfIdf
  ) %>%
  tm::removeSparseTerms(sparse = 0.99)
# Look up the account_type label for each row (document) of the DTM.
# match() on tweet_id replaces the original for-loop, which both grew the
# vector with c() (O(n^2) copying) and re-filtered the full data frame on
# every iteration. Row order of the matrix is preserved. Assumes tweet_id
# values are unique -- TODO confirm; with duplicates the original loop
# would have appended every match rather than the first.
left_right_labels <- as.factor(
  leftRightData$account_type[
    match(rownames(as.matrix(left_right_matrix_small)),
          as.character(leftRightData$tweet_id))
  ]
)
# Create a 75/25 train/test split over the DTM rows (seed fixed).
set.seed(1111)
sample_size <- floor(0.75 * nrow(left_right_matrix_small))
train_ind <- sample(nrow(left_right_matrix_small), size = sample_size)
train <- left_right_matrix_small[train_ind, ]
test <- left_right_matrix_small[-train_ind, ]
# Fit a random forest classifier on the training rows.
# BUG FIX: randomForest's argument is `ntree`, not `nTree`; the original
# `nTree = 50` was silently swallowed by `...`, so the model actually grew
# the default 500 trees instead of the intended 50.
rfc <- randomForest::randomForest(x = as.data.frame(as.matrix(train)),
                                  y = left_right_labels[train_ind],
                                  ntree = 50)
# Print the results
rfc
# Percentage correctly labeled "Left"
# left <- (350) / (350 + 157)
# left
# Percentage correctly labeled "Right"
# right <- (436) / (436 + 57)
# right
# Overall Accuracy:
# accuracy <- (350 + 436) / (350 + 436 + 57 + 157)
# accuracy
# Sentences from Animal Farm that mention "napoleon", tagged with an id.
napolSents <- animal_farm %>%
  tidytext::unnest_tokens(output = "sentences", input = text_column, token = "sentences") %>%
  mutate(sentence_id = row_number(),
         napoleon = str_detect(sentences, 'napoleon')) %>%
  filter(napoleon)
# Stemmed, stop-word-free tokens from those sentences
pig_tokens <- napolSents %>%
  tidytext::unnest_tokens(output = "word", token = "words", input = sentences) %>%
  anti_join(tidytext::stop_words) %>%
  mutate(word = SnowballC::wordStem(word))
# Term-frequency document-term matrix, one document per sentence
pig_matrix <- pig_tokens %>%
  count(sentence_id, word) %>%
  tidytext::cast_dtm(document = sentence_id, term = word,
                     value = n, weighting = tm::weightTf) %>%
  tm::removeSparseTerms(sparse = 0.995)
# Perform Topic Modeling: 10-topic LDA via Gibbs sampling (fixed seed)
sentence_lda <-
topicmodels::LDA(pig_matrix, k = 10, method = 'Gibbs', control = list(seed = 1111))
# Extract the beta matrix (per-topic word probabilities)
sentence_betas <- generics::tidy(sentence_lda, matrix = "beta")
# Topic #2: top words by probability
sentence_betas %>%
filter(topic == 2) %>%
arrange(-beta)
# Topic #3 (original comment said "Topic #10", but the code filters topic 3)
sentence_betas %>%
filter(topic == 3) %>%
arrange(-beta)
# Extract the beta (topic-word) and gamma (document-topic) matrices
sentence_betas <- generics::tidy(sentence_lda, matrix = "beta")
sentence_gammas <- generics::tidy(sentence_lda, matrix = "gamma")
# Explore Topic 5 Betas: its most probable words
sentence_betas %>%
filter(topic == 5) %>%
arrange(-beta)
# Explore Topic 5 Gammas: documents most dominated by topic 5
sentence_gammas %>%
filter(topic == 5) %>%
arrange(-gamma)
# Print the top sentence for topic 5: take the document with the highest
# gamma for topic 5 and look its text up in napolSents by sentence_id.
# (The gamma `document` column is character, hence the as.numeric().)
napolSents$sentences[which(napolSents$sentence_id == (sentence_gammas %>% group_by(topic) %>%
top_n(1, gamma) %>% filter(topic==5) %>%
pull(document) %>% as.numeric()
)
)
]
# Tokenize, de-stop-word, and stem the Right-leaning tweet sample.
right_tokens <- rightTweet %>%
tidytext::unnest_tokens(output = "word", token = "words", input = content) %>%
anti_join(tidytext::stop_words) %>%
mutate(word = SnowballC::wordStem(word))
# Create a document term matrix (raw term frequencies)
right_matrix <- right_tokens %>%
count(tweet_id, word) %>%
tidytext::cast_dtm(document = tweet_id, term = word, value = n, weighting = tm::weightTf)
# Setup train and test data (90/10 split; seed fixed for reproducibility)
sample_size <- floor(0.90 * nrow(right_matrix))
set.seed(1111)
train_ind <- sample(nrow(right_matrix), size = sample_size)
train <- right_matrix[train_ind, ]
test <- right_matrix[-train_ind, ]
# Perform topic modeling (5 topics, Gibbs sampling)
lda_model <- topicmodels::LDA(train, k = 5, method = "Gibbs",control = list(seed = 1111))
# Perplexity on the training set (lower is better)
topicmodels::perplexity(lda_model, newdata = train)
# Perplexity on held-out test data, to gauge generalization
topicmodels::perplexity(lda_model, newdata = test)
# Extract the gamma (document-topic) matrix
gamma_values <- generics::tidy(sentence_lda, matrix = "gamma")
# For each document keep only its single highest-weighted topic,
# then regroup by that top topic
grouped_gammas <- gamma_values %>%
  group_by(document) %>%
  arrange(desc(gamma)) %>%
  slice(1) %>%
  group_by(topic)
# Count documents per top topic.
# BUG FIX: the original `tally(topic, sort = TRUE)` passed `topic` as
# tally()'s `wt` argument, summing topic numbers within each group instead
# of counting rows; a plain tally() counts, as the comment intended.
grouped_gammas %>%
  tally(sort = TRUE)
# Average topic weight for the top topic of each sentence
grouped_gammas %>%
  summarize(avg = mean(gamma)) %>%
  arrange(desc(avg))
Chapter 4 - Advanced Techniques
Sentiment Analysis:
Word Embeddings:
Additional NLP Analysis:
Wrap Up:
Example code includes:
# Inspect the "bing" lexicon (word plus positive/negative sentiment)
tidytext::get_sentiments("bing")
# Tally words per sentiment class in bing
tidytext::get_sentiments("bing") %>%
  count(sentiment) %>%
  arrange(desc(n))
# Tally words per class in the finance-oriented "loughran" lexicon
tidytext::get_sentiments("loughran") %>%
  count(sentiment) %>%
  arrange(desc(n))
# "afinn" scores words on an integer scale; tally how often each score occurs
tidytext::get_sentiments("afinn") %>%
  count(value) %>%
  arrange(desc(n))
# Split Animal Farm into sentences, numbering each one.
afSents <- animal_farm %>%
tidytext::unnest_tokens(output = "sentence", input = text_column, token = "sentences") %>%
mutate(sentence_id=row_number())
# Print the overall sentiment associated with each pig's sentences
for(name in c("napoleon", "snowball", "squealer")) {
# Filter to the sentences mentioning the pig
pig_sentences <- afSents[grepl(name, afSents$sentence), ]
# Tokenize the text and drop stop words
temp_tokens <- pig_sentences %>%
tidytext::unnest_tokens(output = "word", token = "words", input = sentence) %>%
anti_join(tidytext::stop_words)
# Use afinn to find the overall sentiment score (sum of per-word values)
result <- temp_tokens %>%
inner_join(tidytext::get_sentiments("afinn")) %>%
summarise(sentiment = sum(value))
# Print the result (explicit print(): auto-printing is off inside loops)
print(paste0(name, ": ", result$sentiment))
}
# Tokens from Left-leaning tweets, stop words removed.
left_tokens <- russian_tweets %>%
filter(account_type=="Left") %>%
tidytext::unnest_tokens(output = "word", token = "words", input = content) %>%
anti_join(tidytext::stop_words)
# Dictionaries
# NOTE(review): "anticipation" is an NRC category; the bing lexicon only
# has positive/negative, so the commented call below should likely use
# get_sentiments("nrc") -- verify before uncommenting.
# anticipation <- tidytext::get_sentiments("bing") %>%
# filter(sentiment == "anticipation")
# joy <- tidytext::get_sentiments("nrc") %>%
# filter(sentiment == "joy")
# Print top words for Anticipation and Joy
# left_tokens %>%
# inner_join(anticipation, by = "word") %>%
# count(word, sort = TRUE)
# left_tokens %>%
# inner_join(joy, by = "word") %>%
# count(word, sort = TRUE)
# Initialize a h2o session (starts a local JVM-backed cluster)
library(h2o)
h2o.init()
# Create an h2o frame from the balanced Left/Right tweet sample
h2o_object = as.h2o(leftRightData)
# Tokenize the tweet text on non-word characters
tweet_words <- h2o.tokenize(h2o_object$content, "\\\\W+")
# Lowercase and remove stopwords
tweet_words <- h2o.tolower(tweet_words)
# NOTE(review): `||` is R's scalar OR; on an elementwise H2OFrame mask the
# vectorized `|` is presumably intended. Left as-is because h2o's operator
# overloading semantics are not visible here -- verify against h2o docs.
tweet_words = tweet_words[is.na(tweet_words) || (!tweet_words %in% tidytext::stop_words$word),]
tweet_words
# set.seed(1111)
# Use 33% of the available data
# sample_size <- floor(0.33 * nrow(job_titles))
# sample_data <- sample(nrow(job_titles), size = sample_size)
# h2o_object = as.h2o(job_titles[sample_data, ])
# words <- h2o.tokenize(h2o_object$jobtitle, "\\\\W+")
# words <- h2o.tolower(words)
# words = words[is.na(words) || (!words %in% stop_words$word),]
# word2vec_model <- h2o.word2vec(words, min_word_freq=5, epochs = 10)
# Find synonyms for the word "teacher"
# h2o.findSynonyms(word2vec_model, "teacher", count=10)
# a: Labels each word within text as either a noun, verb, adjective, or other category.
# b: A model pre-trained on a vast amount of text data to create a language representation used for supervised learning.
# c: A type of analysis that looks to describe text as either positive or negative and can be used to find active vs passive terms.
# d: A modeling technique used to label entire text into a single category such as relevant or not-relevant.
# Sentiment Analysis
# SA <- c
# Classification Modeling
# CM <- d
# BERT
# BERT <- b
# Part-of-speech Tagging
# POS <- a
# e: Modeling techniques, including LDA, used to cluster text into groups or types based on similar words being used.
# f: A method for searching through text and tagging words that distinguish people, locations, or organizations.
# g: Method used to search text for specific patterns.
# h: Representing words using a large vector space where similar words are close together within the vector space.
# Named Entity Recognition
# NER <- f
# Topic Modeling
# TM <- e
# Word Embeddings
# WE <- h
# Regular Expressions
# REGEX <- g
Chapter 1 - Joining Tables
The inner_join verb:
Joining with a one-to-many relationship:
Joining three or more tables:
Example code includes:
# Load the Lego database tables used throughout the joining exercises.
parts <- readRDS("./RInputFiles/parts.rds")
part_categories <- readRDS("./RInputFiles/part_categories.rds")
inventory_parts <- readRDS("./RInputFiles/inventory_parts.rds")
inventories <- readRDS("./RInputFiles/inventories.rds")
sets <- readRDS("./RInputFiles/sets.rds")
themes <- readRDS("./RInputFiles/themes.rds")
colors <- readRDS("./RInputFiles/colors.rds")
# Join parts to their categories, disambiguating the shared `name` column
# with "_part"/"_category" suffixes instead of the default .x/.y.
parts %>%
  inner_join(part_categories, by = c("part_cat_id" = "id"),
             suffix = c("_part", "_category"))
# Parts joined onto inventory parts (parts as the left table)
parts %>%
  inner_join(inventory_parts, by = c("part_num"))
# The same join in the other direction (inventory_parts as the left table)
inventory_parts %>%
  inner_join(parts, by = "part_num")
# Chain sets -> inventories -> inventory_parts
sets %>%
  inner_join(inventories, by = "set_num") %>%
  inner_join(inventory_parts, by = c("id" = "inventory_id"))
# Add colors as a fourth table and count pieces per color name
sets %>%
  inner_join(inventories, by = "set_num") %>%
  inner_join(inventory_parts, by = c("id" = "inventory_id")) %>%
  inner_join(colors, by = c("color_id" = "id"), suffix = c("_set", "_color")) %>%
  count(name_color, sort = TRUE)
Chapter 2 - Left and Right Joins
The left_join verb:
The right_join verb:
Joining tables to themselves:
Example code includes:
# Attach set numbers to inventory parts and keep the key columns.
inventory_parts_joined <- inventory_parts %>%
  inner_join(inventories, by = c("inventory_id" = "id")) %>%
  select(set_num, part_num, color_id, quantity)
str(inventory_parts_joined)
# Parts list for the Millennium Falcon set
millennium_falcon <- inventory_parts_joined %>%
  filter(set_num == "7965-1")
str(millennium_falcon)
# Parts list for the Star Destroyer set
star_destroyer <- inventory_parts_joined %>%
  filter(set_num == "75190-1")
str(star_destroyer)
# Line up Falcon parts against Destroyer parts by part and color
millennium_falcon %>%
  left_join(star_destroyer, by = c("part_num", "color_id"),
            suffix = c("_falcon", "_star_destroyer"))
# Total Falcon quantity per color
millennium_falcon_colors <- millennium_falcon %>%
  group_by(color_id) %>%
  summarize(total_quantity = sum(quantity))
# Total Destroyer quantity per color
star_destroyer_colors <- star_destroyer %>%
  group_by(color_id) %>%
  summarize(total_quantity = sum(quantity))
# Compare the per-color totals side by side
millennium_falcon_colors %>%
  left_join(star_destroyer_colors, by = "color_id",
            suffix = c("_falcon", "_star_destroyer"))
# Keep only version-1 inventories.
inventory_version_1 <- inventories %>%
  filter(version == 1)
# Sets lacking a version-1 inventory show up with NA after the left join
sets %>%
  left_join(inventory_version_1, by = "set_num") %>%
  filter(is.na(version))
# Part categories matching no parts: n is NA after the right join
parts %>%
  count(part_cat_id) %>%
  right_join(part_categories, by = c("part_cat_id" = "id")) %>%
  filter(is.na(n))
# Same join, but treat "no parts" as a count of zero
parts %>%
  count(part_cat_id) %>%
  right_join(part_categories, by = c("part_cat_id" = "id")) %>%
  replace_na(list(n = 0))
# Self-join themes to pair each parent theme with its children, then keep
# the children of "Harry Potter".
themes %>%
  inner_join(themes, by = c("id" = "parent_id"),
             suffix = c("_parent", "_child")) %>%
  filter(name_parent == "Harry Potter")
# A second self-join reaches grandchild themes
themes %>%
  inner_join(themes, by = c("id" = "parent_id"), suffix = c("_parent", "_child")) %>%
  inner_join(themes, by = c("id_child" = "parent_id"), suffix = c("_parent", "_grandchild"))
# Left-join children so leaf themes (no children) appear with NA child ids
themes %>%
  left_join(themes, by = c("id" = "parent_id"), suffix = c("_parent", "_child")) %>%
  filter(is.na(id_child))
Chapter 3 - Full, Semi, and Anti Joins
The full_join verb:
The semi and anti-join verbs:
Visualizing set differences:
Example code includes:
# Chapter 3 examples: full joins to compare Batman vs. Star Wars parts,
# semi/anti joins as filtering joins, and per-theme color percentages.
# NOTE(review): reuses the LEGO tables loaded earlier in the document.
inventory_parts_joined <- inventories %>%
inner_join(inventory_parts, by = c("id" = "inventory_id")) %>%
arrange(desc(quantity)) %>%
select(-id, -version)
str(inventory_parts_joined)
inventory_parts_joined %>%
# Combine the sets table with inventory_parts_joined
inner_join(sets, by=c("set_num"="set_num")) %>%
# Combine the themes table with your first join
inner_join(themes, by=c("theme_id"="id"), suffix=c("_set", "_theme"))
inventory_sets_themes <- inventory_parts_joined %>%
inner_join(sets, by = "set_num") %>%
inner_join(themes, by = c("theme_id" = "id"), suffix = c("_set", "_theme"))
str(inventory_sets_themes)
# Subset the two themes to be compared
batman <- inventory_sets_themes %>%
filter(name_theme == "Batman")
str(batman)
star_wars <- inventory_sets_themes %>%
filter(name_theme == "Star Wars")
str(star_wars)
# Count the part number and color id, weight by quantity
(batman_parts <- batman %>%
count(part_num, color_id, wt=quantity))
(star_wars_parts <- star_wars %>%
count(part_num, color_id, wt=quantity))
# full_join keeps parts that appear in only one of the two themes
(parts_joined <- batman_parts %>%
# Combine the star_wars_parts table
full_join(star_wars_parts, by=c("part_num", "color_id"), suffix=c("_batman", "_star_wars")) %>%
# Replace NAs with 0s in the n_batman and n_star_wars columns
replace_na(list(n_batman=0, n_star_wars=0)))
parts_joined %>%
# Sort the number of star wars pieces in descending order
arrange(-n_star_wars) %>%
# Join the colors table to the parts_joined table
left_join(colors, by=c("color_id"="id")) %>%
# Join the parts table to the previous join
left_join(parts, by=c("part_num"), suffix=c("_color", "_part"))
batmobile <- inventory_parts_joined %>%
filter(set_num == "7784-1") %>%
select(-set_num)
str(batmobile)
batwing <- inventory_parts_joined %>%
filter(set_num == "70916-1") %>%
select(-set_num)
str(batwing)
# Filter the batwing set for parts that are also in the batmobile set
batwing %>%
semi_join(batmobile, by=c("part_num"))
# Filter the batwing set for parts that aren't in the batmobile set
batwing %>%
anti_join(batmobile, by=c("part_num"))
# Use inventory_parts to find colors included in at least one set
colors %>%
semi_join(inventory_parts, by=c("id"="color_id"))
# Use filter() to extract version 1
version_1_inventories <- inventories %>%
filter(version==1)
# Use anti_join() to find which set is missing a version 1
sets %>%
anti_join(version_1_inventories, by=c("set_num"))
# One combined pipeline producing the set/theme-annotated parts table
(inventory_parts_themes <- inventories %>%
inner_join(inventory_parts, by = c("id" = "inventory_id")) %>%
arrange(desc(quantity)) %>%
select(-id, -version) %>%
inner_join(sets, by = "set_num") %>%
inner_join(themes, by = c("theme_id" = "id"), suffix = c("_set", "_theme")))
batman_colors <- inventory_parts_themes %>%
# Filter the inventory_parts_themes table for the Batman theme
filter(name_theme=="Batman") %>%
group_by(color_id) %>%
summarize(total = sum(quantity)) %>%
# Add a percent column of the total divided by the sum of the total
mutate(percent=total/sum(total))
# Filter and aggregate the Star Wars set data; add a percent column
star_wars_colors <- inventory_parts_themes %>%
filter(name_theme=="Star Wars") %>%
group_by(color_id) %>%
summarize(total = sum(quantity)) %>%
mutate(percent=total/sum(total))
(colors_joined <- batman_colors %>%
full_join(star_wars_colors, by = "color_id", suffix = c("_batman", "_star_wars")) %>%
replace_na(list(total_batman = 0, total_star_wars = 0, percent_batman=0, percent_star_wars=0)) %>%
inner_join(colors, by = c("color_id" = "id")) %>%
# Create the difference and total columns
mutate(difference = percent_batman - percent_star_wars, total = total_batman + total_star_wars) %>%
# Filter for totals greater than 200
filter(total >= 200))
# Named lookup vector mapping LEGO color names to hex codes, used to color
# the Batman-vs-Star-Wars difference bar plot below. Built as a single
# named c() call so each name sits beside its hex value (the original
# assigned names() in a second step). Note Trans-Red reuses Red's hex.
color_palette <- c(
  'Black' = '#05131D',
  'Blue' = '#0055BF',
  'Red' = '#C91A09',
  'Yellow' = '#F2CD37',
  'White' = '#FFFFFF',
  'Tan' = '#E4CD9E',
  'Dark Tan' = '#958A73',
  'Trans-Red' = '#C91A09',
  'Trans-Yellow' = '#F5CD2F',
  'Reddish Brown' = '#582A12',
  'Light Bluish Gray' = '#A0A5A9',
  'Dark Bluish Gray' = '#6C6E68',
  'Medium Dark Flesh' = '#CC702A',
  'Flat Silver' = '#898788',
  'Sand Green' = '#A0BCAC',
  'Light Gray' = '#D3D3D3'
)
color_palette
# Create a bar plot using colors_joined and the name and difference columns.
# reorder() sorts the bars by the Batman-minus-Star-Wars percent difference;
# coord_flip() turns them horizontal so color names are readable.
ggplot(colors_joined, aes(x=reorder(name, difference), y=difference, fill = name)) +
geom_col() +
coord_flip() +
# guide = FALSE hides the legend (fill already matches the axis labels).
# NOTE(review): newer ggplot2 (>= 3.3.4) prefers guide = "none".
scale_fill_manual(values = color_palette, guide = FALSE) +
labs(y = "Difference: Batman - Star Wars")
Chapter 4 - Case Study: Stack Overflow
Stack Overflow Questions:
Joining Questions and Answers:
The bind_rows verb:
Wrap up:
Example code includes:
# Chapter 4 case study: join Stack Overflow questions, answers, and tags;
# stack questions and answers with bind_rows(); count posts by type/year/tag.
questions <- readRDS("./RInputFiles/questions.rds")
tags <- readRDS("./RInputFiles/tags.rds")
question_tags <- readRDS("./RInputFiles/question_tags.rds")
answers <- readRDS("./RInputFiles/answers.rds")
# Replace the NAs in the tag_name column (untagged questions => "only-r")
questions_with_tags <- questions %>%
left_join(question_tags, by = c("id" = "question_id")) %>%
left_join(tags, by = c("tag_id" = "id")) %>%
replace_na(list(tag_name="only-r"))
questions_with_tags %>%
# Group by tag_name
group_by(tag_name) %>%
# Get mean score and num_questions
summarize(score = mean(score), num_questions = n()) %>%
# Sort num_questions in descending order
arrange(-num_questions)
# Using a join, filter for tags that are never on an R question
tags %>%
anti_join(question_tags, by=c("id"="tag_id"))
questions %>%
# Inner join questions and answers with proper suffixes
inner_join(answers, by=c("id"="question_id"), suffix=c("_question", "_answer")) %>%
# Subtract creation_date_question from creation_date_answer to create gap
mutate(gap = as.integer(creation_date_answer-creation_date_question))
# Count and sort the question id column in the answers table
answer_counts <- answers %>%
count(question_id, sort=TRUE)
# Combine the answer_counts and questions tables
question_answer_counts <- questions %>%
left_join(answer_counts, by=c("id"="question_id")) %>%
# Replace the NAs in the n column (questions with zero answers)
replace_na(list(n=0))
tagged_answers <- question_answer_counts %>%
# Join the question_tags tables
inner_join(question_tags, by=c("id"="question_id")) %>%
# Join the tags table
inner_join(tags, by=c("tag_id"="id"))
tagged_answers %>%
# Aggregate by tag_name
group_by(tag_name) %>%
# Summarize questions and average_answers
summarize(questions = n(), average_answers = mean(n)) %>%
# Sort the questions in descending order
arrange(-questions)
# Inner join the question_tags and tags tables with the questions table
questions_with_tags <- questions %>%
inner_join(question_tags, by = c("id"="question_id")) %>%
inner_join(tags, by = c("tag_id"="id"))
# Inner join the question_tags and tags tables with the answers table
answers_with_tags <- answers %>%
inner_join(question_tags, by = c("question_id"="question_id")) %>%
inner_join(tags, by = c("tag_id"="id"))
# Combine the two tables into posts_with_tags, tagging each row's origin
posts_with_tags <- bind_rows(questions_with_tags %>% mutate(type = "question"), answers_with_tags %>% mutate(type = "answer"))
# Add a year column, then aggregate by type, year, and tag_name
by_type_year_tag <- posts_with_tags %>%
mutate(year=lubridate::year(creation_date)) %>%
count(type, year, tag_name)
# Filter for the dplyr and ggplot2 tag names
by_type_year_tag_filtered <- by_type_year_tag %>%
filter(tag_name %in% c("dplyr", "ggplot2"))
# Create a line plot faceted by the tag name
ggplot(by_type_year_tag_filtered, aes(x=year, y=n, color = type)) +
geom_line() +
facet_wrap(~ tag_name)
Chapter 1 - Introducing TensorFlow in R
What is TensorFlow?
TensorFlow Syntax, Variables, and Placeholders:
TensorBoard - Visualizing TensorFlow Models:
Example code includes:
# Chapter 1 examples: constants, variables, sessions, and TensorBoard.
# NOTE(review): tf$Session() is the TensorFlow 1.x-style API; confirm the
# installed TensorFlow version supports it (TF 2.x requires compat.v1).
# Miniconda has been successfully installed at "C:/.../AppData/Local/r-miniconda".
# Need to install and add TensorFlow to PATH for this to work
library(tensorflow)
# Create your session
sess <- tf$Session()
# Define a constant (you'll learn this next!)
HiThere <- tf$constant('Hi DataCamp Student!')
# Run your session with the HiThere constant
print(sess$run(HiThere))
# Close the session
sess$close()
# Create two constant tensors
myfirstconstanttensor <- tf$constant(152)
mysecondconstanttensor <- tf$constant('I am a tensor master!')
# Create a matrix of zeros
myfirstvariabletensor <- tf$Variable(tf$zeros(shape(5, 1)))
# Set up your session
EmployeeSession <- tf$Session()
# Add your constants (name = ... labels the nodes in the TensorBoard graph)
female <- tf$constant(150, name = "FemaleEmployees")
male <- tf$constant(135, name = "MaleEmployees")
total <- tf$add(female, male)
print(EmployeeSession$run(total))
# Write to file
towrite <- tf$summary$FileWriter('./graphs', EmployeeSession$graph)
# Open Tensorboard
tensorboard(log_dir = './graphs')
# From last exercise
total <- tf$add(female,male)
# Multiply your allemps by growth projections
growth <- tf$constant(1.32, name = "EmpGrowth")
EmpGrowth <- tf$math$multiply(total, growth)
print(EmployeeSession$run(EmpGrowth))
# Write to file
towrite <- tf$summary$FileWriter('./graphs', EmployeeSession$graph)
# Open Tensorboard
tensorboard(log_dir = './graphs')
# Start Session
sess <- tf$Session()
# Create 2 constants
a <- tf$constant(10)
b <- tf$constant(32)
# Add your two constants together
sess$run(a + b)
# Create a Variable
mytestvariable <- tf$Variable(tf$zeros(shape(1L)))
# Run the last line (prints the variable object, not its value)
mytestvariable
Chapter 2 - Linear Regression Using Two TensorFlow API
Core API: Linear Regression:
Core API: Linear Regression Part II:
Core API: Linear Regression Part III:
Training loop snippet: `sess$run(train); if (step %% 500 == 0) cat("Step = ", step, "Estimate w = ", sess$run(w), "Estimate b = ", sess$run(b))`
Estimators API: Multiple Linear Regression:
Example code includes:
# Chapter 2 examples: simple linear regression y = w*x + b with the TF core
# API (gradient descent on MSE), then the same model via the Estimators API.
# NOTE(review): assumes studentgradeprediction_train/_test are loaded
# elsewhere; this section runs only with a working TensorFlow install.
# Parse out the minimum study time and final percent in x_data and y_data variables
x_data <- studentgradeprediction_train$minstudytime
y_data <- studentgradeprediction_train$Finalpercent
# Define your w variable (random slope in [-1, 1])
w <- tf$Variable(tf$random_uniform(shape(1L), -1.0, 1.0))
# Define your b variable (intercept initialized to zero)
b <- tf$Variable(tf$zeros(shape(1L)))
# Define your linear equation
y <- w * x_data + b
# Define cost function (mean squared error)
loss <- tf$reduce_mean((y-y_data)^2)
# Use the Gradient Descent Optimizer (learning rate 0.0001)
optimizer <- tf$train$GradientDescentOptimizer(0.0001)
# Minimize MSE loss
train <- optimizer$minimize(loss)
# Launch new session
Finalgradessession <- tf$Session()
# Initialize (run) global variables
Finalgradessession$run(tf$global_variables_initializer())
# Train your model (report estimates every 750 steps)
for (step in 1:3750) {
Finalgradessession$run(train)
if (step %% 750 == 0) cat("Step = ", step, "Estimate w = ", Finalgradessession$run(w), "Estimate b =", Finalgradessession$run(b), "\n")
}
# Calculate the predicted grades
grades_actual <- studentgradeprediction_test$Finalpercent
grades_predicted <- as.vector(Finalgradessession$run(w)) *
studentgradeprediction_test$minstudytime +
as.vector(Finalgradessession$run(b))
# Plot the actual and predicted grades
plot(grades_actual, grades_predicted, pch=19, col='red')
# Run a correlation
cor(grades_actual, grades_predicted)
# Define all four of your feature columns
# NOTE(review): the body of this call appears truncated in the transcript.
ftr_colns <- feature_columns(
)
# Choose the correct model
grademodel <- linear_regressor(feature_columns = ftr_colns)
# Define your input function
# NOTE(review): the body of this function appears truncated in the transcript.
grade_input_fn <- function(data){
}
# Train your model
# NOTE(review): `train` here shadows the optimizer op defined above and is
# passed as the dataset name — looks like an exercise placeholder; verify.
train(grademodel, grade_input_fn(train))
# Evaluate your model
model_eval <- evaluate(grademodel, grade_input_fn(test))
# See the results
model_eval
# Calculate the predictions from the trained estimator.
# Fix: the input function defined above is grade_input_fn;
# grademodel_input_fn is never defined anywhere in these notes.
predictoutput <- predict(grademodel, input_fn=grade_input_fn(studentgradeprediction_test))
# Plot actual and predicted values
plot(studentgradeprediction_test$Finalpercent, as.numeric(predictoutput$predictions),
xlab = "actual_grades", ylab = "predicted_grades", pch=19, col='red'
)
# Calculate the correlation between predicted and actual final percentages
cor(as.numeric(predictoutput$predictions), studentgradeprediction_test$Finalpercent)
Chapter 3 - Deep Learning in TensorFlow: Creating a Deep Neural Network
Gentle Introduction to Neural Networks:
Deep Neural Networks Using Keras API:
Evaluate, Predict, Visualize Model:
Create DNN Using Estimators API:
Example code includes:
# Chapter 3 examples: build, compile, fit, evaluate, and predict with a
# small Keras sequential network, plus TensorBoard callbacks and the
# Estimators-API equivalent.
# NOTE(review): assumes train_x/train_y/test_x/newdata_x, dnnclassifier,
# shopping_input_function, and shopper_* are defined elsewhere.
# Define the model: 8 inputs -> 15 -> 5 -> 1 output unit
model <- keras_model_sequential()
model %>%
layer_dense(units=15, activation = 'relu', input_shape = 8) %>%
layer_dense(units=5, activation = 'relu') %>%
layer_dense(units=1)
# Compile the model
# NOTE(review): 'accuracy' alongside an 'mse' loss suggests a regression
# output being scored as classification — matches the course, but verify.
model %>%
compile(optimizer = 'rmsprop', loss = 'mse', metrics = c('accuracy'))
# Fit the model (20% of training data held out for validation)
model %>%
fit(x = train_x, y = train_y, epochs = 25, batch_size=32, validation_split = .2)
# Evaluate the model
score <- model %>%
evaluate(test_x, test_y)
# Call up the accuracy
score$acc
# Predict based on your model
predictedclasses <- model %>%
predict_classes(newdata_x)
# Print predicted classes with customers' names
rownames(predictedclasses) <- c('Jasmit', 'Banjeet')
predictedclasses
# Fit the model and define callbacks (log training for TensorBoard)
model %>%
fit(x = train_x, y = train_y,epochs = 25, batch_size = 32, validation_split = .2,
callbacks = callback_tensorboard("logs/run_1")
)
# Call TensorBoard
tensorboard("logs/run_1")
# Train the model
train(dnnclassifier, input_fn = shopping_input_function(shopper_train))
# Evaluate the model by correcting the error
evaluate(dnnclassifier, input_fn = shopping_input_function(shopper_test))
# Create a sequential model and the network architecture in one pipeline
ourdnnmodel <- keras_model_sequential() %>%
layer_dense(units = 10, activation = "relu", input_shape = ncol(train_x)) %>%
layer_dense(units = 5, activation = "relu") %>%
layer_dense(units = 1) %>%
compile(optimizer = 'rmsprop', loss = 'mse', metrics = c("mae", "accuracy"))
# Fit your model
learn <- ourdnnmodel %>%
fit(x = train_x, y = train_y, epochs = 25, batch_size = 32, validation_split = 0.2, verbose = FALSE)
# Run the learn function (prints the training history)
learn
Chapter 4 - Deep Learning in TensorFlow: Increasing Model Accuracy
L2 Regularization Using Keras:
Dropout Technique Using TFEstimators:
Hyperparameter Tuning with tfruns:
Wrap Up:
Example code includes:
# Chapter 4 examples: L2 regularization in Keras, dropout via TFEstimators,
# and hyperparameter tuning with tfruns::tuning_run().
# Define the model
model_lesson1 <- keras_model_sequential()
# Add the regularizer (L2 penalty of 0.1 on the first layer's weights)
model_lesson1 %>%
layer_dense(units=15, activation='relu', input_shape=8, kernel_regularizer=regularizer_l2(l=0.1)) %>%
layer_dense(units=5, activation = 'relu') %>%
layer_dense(units=1)
# Compile the model
model_lesson1 %>%
compile(optimizer = 'rmsprop', loss = 'mse', metrics = c('accuracy'))
# Fit the model
model_lesson1 %>%
fit(x = train_x, y = train_y, epochs = 25, batch_size = 32, validation_split=0.2)
# Evaluate the model
score_lesson1 <- model_lesson1 %>%
evaluate(test_x, test_y)
# Call the accuracy and loss
score_lesson1$acc
score_lesson1$loss
# Define the feature columns (banknote-authentication predictors)
featcols <- feature_columns(
tf$feature_column$numeric_column("Var"), tf$feature_column$numeric_column("Skew"),
tf$feature_column$numeric_column("Kurt"), tf$feature_column$numeric_column("Entropy")
)
# Create the input function
banknote_input_fn <- function(data){
input_fn(data, features = c("Var", "Skew", "Kurt", "Entropy"), response = "Class")
}
# Create your dnn_classifier model (3 hidden layers, 20% dropout)
mymodel <- dnn_classifier(feature_columns = featcols, hidden_units = c(40, 60, 10), n_classes = 2,
label_vocabulary = c("N", "Y"), dropout = 0.2
)
# Train the model
train(mymodel, input_fn = banknote_input_fn(banknote_authentication_train))
# Evaluate your model using the testing dataset
final_evaluation <- evaluate(mymodel, input_fn = banknote_input_fn(banknote_authentication_test))
# Call up the accuracy and precision of your evaluated model
final_evaluation$accuracy
final_evaluation$precision
# Tune the run (grid over the dropout flag in the model source script)
runs <- tuning_run(modelsourcecode_script, flags = list(dropout = c(0.2, 0.3, 0.4)))
# View the outcome
runs[order(runs$eval_accuracy, decreasing = TRUE), ]
# Tune the run (grid over dropout AND activation)
runs <- tuning_run(
modelsourcecode_script, flags = list(dropout = c(0.2, 0.3, 0.4), activation = c("relu", "softmax") )
)
# View the outcome
runs[order(runs$eval_accuracy, decreasing = TRUE), ]
Chapter 1 - Introduction to Market Basket Analysis
Market Basket Introduction:
Item Combinations:
What is Market Basket Analysis?
Example code includes:
# Chapter 1 examples: load the retail and movie datasets and examine a
# single basket (one invoice).
# NOTE(review): the files have an .xls extension but are read with
# read_csv() — presumably CSV content with a misleading extension; verify.
Online_Retail_2011_Q1 <- readr::read_csv("./RInputFiles/Online_Retail_2011_Q1.xls")
str(Online_Retail_2011_Q1)
movie_subset <- readr::read_csv("./RInputFiles/Movie_subset.xls")
str(movie_subset)
# Have a glimpse at the dataset
glimpse(Online_Retail_2011_Q1)
# Filter a single basket
One_basket = Online_Retail_2011_Q1 %>%
filter(InvoiceNo == 540180)
print(One_basket)
# Basket size (number of distinct products in the invoice)
n_distinct(One_basket$StockCode)
# Total number of items purchased
One_basket %>%
summarize(sum(Quantity))
# Plot the total number of items within the basket, ordered by quantity
ggplot(One_basket, aes(x=reorder(Description, Quantity, function(x) sum(x)), y = Quantity)) +
geom_col() +
coord_flip() +
xlab("Items")
# Count the item combinations of every size k = 0..n_items and verify the
# total equals 2^n_items (each item is either in or out of a basket).
# Improvements over the original: `<-` instead of `=` for assignment, the
# element-wise loop replaced by the vectorized choose() call, and the final
# check uses 2^n_items instead of the hard-coded 2^10 so it stays correct
# if n_items is changed.
# Number of items
n_items <- 10
# Matrix with one row per k: column 1 = k, column 2 = C(n_items, k).
# choose() is vectorized over k, so no explicit loop is needed.
k <- 0:n_items
combi <- matrix(c(k, choose(n_items, k)), ncol = 2)
# Sum over all values of k
sum(combi[, 2])
# Total number of possible baskets
2^n_items
# Plot how the number of k-item combinations grows for 100 items, then
# compute basket-size statistics on the cleaned retail data.
# Define number of items
n_items = 100
# Specify the function to be plotted
fun_combi = function(x) choose(n_items, x)
# Plot the number of combinations (choose(100, x) for x in [0, 100])
ggplot(data = data.frame(x = 0), mapping = aes(x = x)) +
stat_function(fun = fun_combi) + xlim(0, n_items)
# Select two baskets
Two_baskets = Online_Retail_2011_Q1 %>%
filter(InvoiceNo %in% c(540160, 540017))
# Basket size (rows vs. distinct products per invoice)
Two_baskets %>%
group_by(InvoiceNo) %>%
summarise(n_total = n(), n_items = n_distinct(StockCode))
# Drop rows with any missing value before computing basket statistics
Online_Retail_clean <- Online_Retail_2011_Q1[complete.cases(Online_Retail_2011_Q1), ]
str(Online_Retail_clean)
# Create dataset with basket counts and inspect results
basket_size = Online_Retail_clean %>%
group_by(InvoiceNo) %>%
summarise(n_total = n(), n_items = n_distinct(StockCode))
head(basket_size)
# Calculate average values
basket_size %>%
summarize(avg_total_items = mean(n_total), avg_dist_items = mean(n_items))
# Distribution of distinct items in baskets
ggplot(basket_size, aes(x=n_items)) +
geom_histogram() + ggtitle("Distribution of basket sizes")
# Number of total and distinct items for HERB MARKER THYME
Online_Retail_clean %>%
filter(Description == "HERB MARKER THYME") %>%
summarise(n_tot_items = n(), n_basket_item = n_distinct(InvoiceNo))
# Number of baskets containing both items
# NOTE(review): filter(n==2) assumes each item appears at most once per
# invoice; duplicate item rows within an invoice would break this — verify.
Online_Retail_clean %>%
filter(Description %in% c("HERB MARKER ROSEMARY", "HERB MARKER THYME")) %>%
group_by(InvoiceNo) %>%
summarise(n = n()) %>%
filter(n==2) %>%
summarise(n_distinct(InvoiceNo))
Chapter 2 - Metrics and Techniques in Market Basket Analysis
Transactional Data:
Metrics in Market Basket Analysis:
The Apriori Algorithm:
Using Apriori for “if this then that”:
Example code includes:
# Chapter 2 examples: build a transactions object from the cleaned retail
# data, mine frequent itemsets and association rules with apriori(), and
# filter/inspect them.
library(arules)
# Splitting transactions: one character vector of item descriptions per invoice
data_list = split(Online_Retail_clean$Description, Online_Retail_clean$InvoiceNo)
# Transform data into a transactional dataset
Online_trx = as(data_list, "transactions")
# Summary of transactions
summary(Online_trx)
# Inspect first 3 transactions
inspect(head(Online_trx, 3))
# Inspect last 5 transactions
inspect(tail(Online_trx, 5))
# Inspect transaction 10
inspect(Online_trx[10])
# Inspect specific transactions
inspect(Online_trx[c(12, 20, 22)])
# Determine the support of both items with support 0.1
support_rosemary_thyme <- apriori(Online_trx, parameter = list(target = "frequent itemsets", supp = 0.1),
appearance = list(items = c("HERB MARKER ROSEMARY", "HERB MARKER THYME"))
)
# Inspect the itemsets (expect none at such a high support threshold)
inspect(support_rosemary_thyme)
# Determine the support of both items with support 0.01
support_rosemary_thyme <- apriori(Online_trx, parameter = list(target = "frequent itemsets", supp = 0.01),
appearance = list(items = c("HERB MARKER ROSEMARY", "HERB MARKER THYME"))
)
# Inspect the itemsets
inspect(support_rosemary_thyme)
# Frequent itemsets for all items
support_all <- apriori(Online_trx, parameter = list(target="frequent itemsets", supp = 0.01))
# Inspect the 5 most frequent items
inspect(head(sort(support_all, by="support"), 5))
# Call the apriori function with appropriate parameters
rules_all <- apriori(Online_trx, parameter = list(supp=0.01, conf = 0.4))
# Call the apriori function with appropriate parameters
# (minlen=2 drops rules with an empty left-hand side)
rules_all <- apriori(Online_trx, parameter = list(supp=0.01, conf = 0.4, minlen=2))
# Inspect the rules with highest confidence
inspect(head(sort(rules_all, by="confidence"), 5))
# Inspect the rules with highest lift
inspect(head(sort(rules_all, by="lift"), 5))
# Find the confidence and lift measures (rosemary as the consequent)
rules_rosemary_rhs <- apriori(Online_trx, parameter = list(supp=0.01, conf=0.5, minlen=2),
appearance = list(rhs="HERB MARKER ROSEMARY", default = "lhs")
)
# Inspect the rules
inspect(rules_rosemary_rhs)
# Find the confidence and lift measures (rosemary as the antecedent)
rules_rosemary_lhs <- apriori(Online_trx, parameter = list(supp=0.01, conf=0.5, minlen=2),
appearance = list(lhs="HERB MARKER ROSEMARY", default = "rhs")
)
# Inspect the rules
inspect(rules_rosemary_lhs)
# Create the union of the rules and inspect
rules_rosemary <- arules::union(rules_rosemary_rhs, rules_rosemary_lhs)
inspect(rules_rosemary)
# Apply the apriori function to the Online retail dataset
rules_online <- apriori(Online_trx, parameter = list(supp = 0.01, conf = 0.8, minlen = 2))
# Inspect the first 5 rules
inspect(head(rules_online, 5))
# Inspect the first 5 rules with highest lift
inspect(head(sort(rules_online, by="lift"), 5))
# Transform the rules back to a dataframe
rules_online_df <- as(rules_online, "data.frame")
# Check the first records
head(rules_online_df)
# Apply the apriori function to the Online retail dataset
rules_online <- apriori(Online_trx, parameter = list(supp = 0.01, conf = 0.8, minlen = 2))
# Inspect the first rules
inspect(head(rules_online))
# Support of herb markers
supp_herb_markers <- apriori(Online_trx, parameter = list(target = "frequent itemsets", supp = 0.01),
appearance = list(items = c("HERB MARKER THYME", "HERB MARKER ROSEMARY"))
)
# Inspect frequent itemsets
inspect(supp_herb_markers)
# Extract rules for HERB MARKER THYME on rhs of rule (verbose=F silences
# apriori's progress output)
rules_thyme_marker_rhs <- apriori(Online_trx, parameter = list(supp=0.01, conf=0.8, minlen=2),
appearance = list(rhs = "HERB MARKER THYME"), control = list(verbose=F)
)
# Inspect rules
inspect(rules_thyme_marker_rhs)
# Extract rules for HERB MARKER THYME on lhs of rule
rules_thyme_marker_lhs <- apriori(Online_trx, parameter = list(supp=0.01, conf=0.8, minlen=2),
appearance = list(lhs = "HERB MARKER THYME"), control = list (verbose=F)
)
# Inspect rules
inspect(rules_thyme_marker_lhs)
# Apply the apriori function to the Online retail dataset
rules <- apriori(Online_trx, parameter = list(supp = 0.01, conf = 0.8, minlen = 2))
# Inspect the first 5 rules
inspect(head(rules))
# Find the redundant rules (rules implied by a more general rule)
redundant_rules <- is.redundant(rules)
# Inspect the non redundant rules
non_redundant_rules <- rules[!redundant_rules]
inspect(head(non_redundant_rules))
Chapter 3 - Visualization in Market Basket Analysis
Items in the Basket:
Visualizing Metrics:
Rules to Graph-Based Visualizations:
Alternative Rule Plots:
Example code includes:
# Chapter 3 examples: visualize item frequencies and association rules
# with arulesViz (scatter, two-key, matrix, graph, grouped, paracoord).
# Display items horizontally (top 5 most frequent)
itemFrequencyPlot(Online_trx, topN = 5, horiz = TRUE)
# Changing the font of the items (cex.names shrinks the labels)
itemFrequencyPlot(Online_trx, topN = 10, col = rainbow(10), type = "relative", horiz = TRUE,
main = "Relative Item Frequency Plot" ,xlab = "Frequency", cex.names = 0.8
)
library(arulesViz)
# Inspection of the rules (interactive DT data table)
inspectDT(rules_online)
# Create a standard scatterplot (support vs. confidence, shaded by lift)
plot(rules_online)
# Change the axis and legend of the scatterplot
plot(rules_online, measure = c("confidence", "lift"), shading = "support")
# Plot a two-key plot (color encodes rule length)
plot(rules_online, method = "two-key plot")
# Plot a matrix plot
plot(rules_online, method = "matrix")
# Plot a matrix plot with confidence as color coding
plot(rules_online, method = "matrix", shading = "confidence")
# Create a HTML widget of the graph of rules
plot(rules_online, method = "graph", engine = "htmlwidget")
# HTML widget graph for the highest confidence rules
plot(head(sort(rules_online, by="confidence"), 5), method = "graph", engine = "htmlwidget")
# HTML widget graph for rules with lowest lift
plot(tail(sort(rules_online, by="lift"), 5), method = "graph", engine = "htmlwidget")
# Create an interactive graph visualization
rules_html <- plot(rules_online, method = "graph", engine = "htmlwidget")
# Save the interactive graph as an html file
# htmlwidgets::saveWidget(rules_html, file = "./RInputFiles/rules_grocery.html")
# Plot a group matrix-based visualization
# plot(subset_rules, method = "grouped")
# Change the arguments of group matrix-based visualization
# plot(subset_rules, method = "grouped", measure = "lift", shading = "confidence")
# Plotting the parallel coordinate plots
plot(rules_online, method = "paracoord")
# Parallel coordinate plots with confidence as color coding
plot(rules_online, method = "paracoord", shading = "confidence")
Chapter 4 - Case Study: Market Basket with Movies
Recap on Transactions:
Mining Association Rules:
Visualizing Transactions and Rules:
Making the most of Market Basket Analysis:
Wrap Up:
Example code includes:
# Chapter 4 case study setup: explore the movie ratings subset, build a
# transactions object (one transaction per user), and mine frequent
# itemsets at two support thresholds.
# Have a glimpse at the dataset
movie_subset %>%
glimpse()
# Calculate the number of distinct users and movies
n_distinct(movie_subset$userId)
n_distinct(movie_subset$movieId)
# Distribution of the number of movies watched by users
movie_subset %>%
group_by(userId) %>%
summarize(nb_movies = n_distinct(movieId)) %>%
ggplot(aes(x=nb_movies)) +
geom_histogram() +
ggtitle("Distribution of number of movies watched")
# Split dataset into movies and users (one title vector per user)
data_list <- split(movie_subset$title, movie_subset$userId)
# Transform data into a transactional dataset
movie_trx <- as(data_list, "transactions")
# Plot of the item matrix (first 100 users x first 100 movies)
image(movie_trx[1:100,1:100])
# Setting the plot configuration option (stack two plots vertically)
par(mfrow=c(2, 1))
# Plot the relative and absolute item frequency plot
itemFrequencyPlot(movie_trx, type = "relative", topN = 10, horiz = TRUE, main = 'Relative item frequency')
itemFrequencyPlot(movie_trx, type = "absolute", topN = 10, horiz = TRUE, main = 'Absolute item frequency')
par(mfrow=c(1, 1))
# Extract the set of most frequent itemsets
itemsets <- apriori(movie_trx, parameter = list(support = 0.4, target = 'frequent itemsets'))
# Inspect the five most popular items
arules::inspect(sort(itemsets, by='support', decreasing = TRUE)[1:5])
# Extract the set of most frequent itemsets (at least 2 movies each)
itemsets_minlen2 <- apriori(movie_trx, parameter = list(support = 0.3, minlen = 2, target = 'frequent'))
# Inspect the five most popular items
arules::inspect(sort(itemsets_minlen2, by='support', decreasing = TRUE)[1:5])
# Scan a grid of confidence levels and record how many rules apriori()
# finds at support 40% and 30%, for the comparison plots that follow.
# Improvements over the original: the 1:length() index loop that grew two
# NULL vectors element-by-element is replaced by vapply(), which
# preallocates and type-checks an integer result per confidence level.
# Set of confidence levels (0.95 down to 0.50 in steps of 0.05)
confidenceLevels <- seq(from=0.95, to=0.5, by=-0.05)
# Number of rules mined from movie_trx at one support/confidence pair
count_rules <- function(supp_level, conf_level) {
  length(apriori(movie_trx,
    parameter = list(sup = supp_level, conf = conf_level, target = "rules")
  ))
}
# Apriori algorithm with a support level of 40% and 30%
rules_sup04 <- vapply(confidenceLevels, function(cl) count_rules(0.4, cl), integer(1))
rules_sup03 <- vapply(confidenceLevels, function(cl) count_rules(0.3, cl), integer(1))
# Plot the rule counts per confidence level, mine and prune the final rule
# set, visualize it, and extract the Pulp Fiction rules in both directions.
# Number of rules found with a support level of 40%
# NOTE(review): qplot() is deprecated in recent ggplot2 — fine for the
# ggplot2 3.2.1 used in this document.
qplot(confidenceLevels, rules_sup04, geom=c("point", "line"), xlab="Confidence level",
ylab="Number of rules found",main="Apriori with a support level of 40%"
) +
theme_bw()
# Create Data frame containing all results
nb_rules <- data.frame(rules_sup04, rules_sup03, confidenceLevels)
# Number of rules found with a support level of 40% and 30%
ggplot(data=nb_rules, aes(x=confidenceLevels)) +
# Lines and points for rules_sup04
geom_line(aes(y=rules_sup04, colour="Support level of 40%")) +
geom_point(aes(y=rules_sup04, colour="Support level of 40%")) +
# Lines and points for rules_sup03
geom_line(aes(y=rules_sup03, colour="Support level of 30%")) +
geom_point(aes(y=rules_sup03, colour="Support level of 30%")) +
# Polishing the graph
theme_bw() + ylab("") +
ggtitle("Number of extracted rules with apriori")
# Extract rules with the apriori
rules_movies <- apriori(movie_trx, parameter = list(supp = 0.3, conf = 0.9, minlen = 2, target = "rules"))
# Summary of extracted rules
summary(rules_movies)
# Flag redundant rules and filter them from the extracted rules
rules_red <- is.redundant(rules_movies)
rules.pruned <- rules_movies[!rules_red]
# Inspect the non-redundant rules with highest confidence
arules::inspect(head(sort(rules.pruned, by="confidence")))
# Plot rules as scatterplot (jitter separates overplotted points)
plot(rules_movies, measure = c("confidence", "lift"), shading = "support", jitter = 1, engine = "html")
# Interactive matrix-based plot
plot(rules_movies, method = "matrix", shading ="confidence", engine = "html")
# Grouped matrix plot of rules
plot(rules_movies, method = "grouped", measure = "lift", shading = "confidence")
# Parallel coordinate plots with confidence as color coding
plot(rules_movies, method = "paracoord", shading = "confidence")
# Plot movie rules as a graph
plot(rules_movies, method = "graph", engine = "htmlwidget")
# Retrieve the top 10 rules with highest confidence
top10_rules_movies = head(sort(rules_movies, by = "confidence"), 10)
# Plot as an interactive graph the top 10 rules
plot(top10_rules_movies, method = "graph", engine = "htmlwidget")
# Extract rules with Pulp Fiction on the right side
pulpfiction_rules_rhs <- apriori(movie_trx, parameter = list(supp = 0.3, conf = 0.5),
appearance = list(default = "lhs", rhs = "Pulp Fiction")
)
# Inspect the first rules
arules::inspect(head(pulpfiction_rules_rhs))
arules::inspect(head(sort(pulpfiction_rules_rhs, by="lift"), 10))
# Extract rules with Pulp Fiction on the left side
pulpfiction_rules_lhs <- apriori(movie_trx, parameter = list(supp = 0.3, conf = 0.5),
appearance = list(default = "rhs", lhs = "Pulp Fiction")
)
# Summary of extracted rules
summary(pulpfiction_rules_lhs)
# Inspect the first rules
arules::inspect(head(pulpfiction_rules_lhs))
# Extract rules with Pulp Fiction on the left side
# (minlen=2 drops the trivial empty-antecedent rules)
pulpfiction_rules_lhs <- apriori(movie_trx, parameter = list(supp = 0.3, conf = 0.5, minlen = 2),
appearance = list(default = "rhs", lhs = "Pulp Fiction")
)
# Inspect the first rules
arules::inspect(head(pulpfiction_rules_lhs))